75 files changed, 1479 insertions, 594 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index f669adb9f37..1afb321da3d 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -74,7 +74,7 @@ enum_panorama_types = (
                           "Similar to most fisheye modern lens, takes sensor dimensions into consideration"),
     ('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"),
     ('FISHEYE_LENS_POLYNOMIAL', "Fisheye Lens Polynomial",
-     "Defines the lens projection as polynomial to allow real world camera lenses to be mimicked."),
+     "Defines the lens projection as polynomial to allow real world camera lenses to be mimicked"),
 )
 
 enum_curve_shape = (
@@ -667,6 +667,11 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         description="Use special type BVH optimized for hair (uses more ram but renders faster)",
         default=True,
     )
+    debug_use_compact_bvh: BoolProperty(
+        name="Use Compact BVH",
+        description="Use compact BVH structure (uses less ram but renders slower)",
+        default=True,
+    )
     debug_bvh_time_steps: IntProperty(
         name="BVH Time Steps",
         description="Split BVH primitives by this number of time steps to speed up render time in cost of memory",
@@ -896,27 +901,27 @@ class CyclesCameraSettings(bpy.types.PropertyGroup):
 
     fisheye_polynomial_k0: FloatProperty(
         name="Fisheye Polynomial K0",
-        description="Coefficient K0 of the lens polinomial",
+        description="Coefficient K0 of the lens polynomial",
         default=camera.default_fisheye_polynomial[0], precision=6, step=0.1, subtype='ANGLE',
     )
     fisheye_polynomial_k1: FloatProperty(
         name="Fisheye Polynomial K1",
-        description="Coefficient K1 of the lens polinomial",
+        description="Coefficient K1 of the lens polynomial",
         default=camera.default_fisheye_polynomial[1], precision=6, step=0.1, subtype='ANGLE',
     )
     fisheye_polynomial_k2: FloatProperty(
         name="Fisheye Polynomial K2",
-        description="Coefficient K2 of the lens polinomial",
+        description="Coefficient K2 of the lens polynomial",
         default=camera.default_fisheye_polynomial[2], precision=6, step=0.1, subtype='ANGLE',
     )
     fisheye_polynomial_k3: FloatProperty(
         name="Fisheye Polynomial K3",
-        description="Coefficient K3 of the lens polinomial",
+        description="Coefficient K3 of the lens polynomial",
         default=camera.default_fisheye_polynomial[3], precision=6, step=0.1, subtype='ANGLE',
     )
     fisheye_polynomial_k4: FloatProperty(
         name="Fisheye Polynomial K4",
-        description="Coefficient K4 of the lens polinomial",
+        description="Coefficient K4 of the lens polynomial",
         default=camera.default_fisheye_polynomial[4], precision=6, step=0.1, subtype='ANGLE',
     )
 
@@ -1447,6 +1452,19 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                     num += 1
         return num
 
+    def has_multi_device(self):
+        import _cycles
+        compute_device_type = self.get_compute_device_type()
+        device_list = _cycles.available_devices(compute_device_type)
+        for device in device_list:
+            if device[1] == compute_device_type:
+                continue
+            for dev in self.devices:
+                if dev.use and dev.id == device[2]:
+                    return True
+
+        return False
+
     def has_active_device(self):
         return self.get_num_gpu_devices() > 0
 
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 5b600692152..e4b2fef87c3 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -118,6 +118,12 @@ def use_optix(context):
 
     return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
 
+def use_multi_device(context):
+    cscene = context.scene.cycles
+    if cscene.device != 'GPU':
+        return False
+    return context.preferences.addons[__package__].preferences.has_multi_device()
+
 
 def show_device_active(context):
     cscene = context.scene.cycles
@@ -661,6 +667,10 @@ class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Pa
     bl_label = "Acceleration Structure"
     bl_parent_id = "CYCLES_RENDER_PT_performance"
 
+    @classmethod
+    def poll(cls, context):
+        return not use_optix(context) or use_multi_device(context)
+
     def draw(self, context):
         import _cycles
 
@@ -673,21 +683,33 @@ class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Pa
 
         col = layout.column()
 
-        use_embree = False
+        use_embree = _cycles.with_embree
+
         if use_cpu(context):
-            use_embree = _cycles.with_embree
-            if not use_embree:
+            col.prop(cscene, "debug_use_spatial_splits")
+            if use_embree:
+                col.prop(cscene, "debug_use_compact_bvh")
+            else:
+                sub = col.column()
+                sub.active = not cscene.debug_use_spatial_splits
+                sub.prop(cscene, "debug_bvh_time_steps")
+
+                col.prop(cscene, "debug_use_hair_bvh")
+
                 sub = col.column(align=True)
                 sub.label(text="Cycles built without Embree support")
                 sub.label(text="CPU raytracing performance will be poor")
+        else:
+            col.prop(cscene, "debug_use_spatial_splits")
+            sub = col.column()
+            sub.active = not cscene.debug_use_spatial_splits
+            sub.prop(cscene, "debug_bvh_time_steps")
 
-        col.prop(cscene, "debug_use_spatial_splits")
-        sub = col.column()
-        sub.active = not use_embree
-        sub.prop(cscene, "debug_use_hair_bvh")
-        sub = col.column()
-        sub.active = not cscene.debug_use_spatial_splits and not use_embree
-        sub.prop(cscene, "debug_bvh_time_steps")
+            col.prop(cscene, "debug_use_hair_bvh")
+
+            # CPU is used in addition to a GPU
+            if use_multi_device(context) and use_embree:
+                col.prop(cscene, "debug_use_compact_bvh")
 
 
 class CYCLES_RENDER_PT_performance_final_render(CyclesButtonsPanel, Panel):
diff --git a/intern/cycles/blender/curves.cpp b/intern/cycles/blender/curves.cpp
index 65a02d041cc..102ddf5ee32 100644
--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <optional>
+
 #include "blender/sync.h"
 #include "blender/util.h"
 
@@ -624,15 +626,36 @@ void BlenderSync::sync_particle_hair(
   }
 }
 
-#ifdef WITH_HAIR_NODES
-static float4 hair_point_as_float4(BL::HairPoint b_point)
+#ifdef WITH_NEW_CURVES_TYPE
+
+static std::optional<BL::FloatAttribute> find_curves_radius_attribute(BL::Curves b_curves)
 {
-  float4 mP = float3_to_float4(get_float3(b_point.co()));
-  mP.w = b_point.radius();
+  for (BL::Attribute &b_attribute : b_curves.attributes) {
+    if (b_attribute.name() != "radius") {
+      continue;
+    }
+    if (b_attribute.domain() != BL::Attribute::domain_POINT) {
+      continue;
+    }
+    if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT) {
+      continue;
+    }
+    return BL::FloatAttribute{b_attribute};
+  }
+  return std::nullopt;
+}
+
+static float4 hair_point_as_float4(BL::Curves b_curves,
+                                   std::optional<BL::FloatAttribute> b_attr_radius,
+                                   const int index)
+{
+  float4 mP = float3_to_float4(get_float3(b_curves.position_data[index].vector()));
+  mP.w = b_attr_radius ? b_attr_radius->data[index].value() : 0.0f;
   return mP;
 }
 
-static float4 interpolate_hair_points(BL::Hair b_hair,
+static float4 interpolate_hair_points(BL::Curves b_curves,
+                                      std::optional<BL::FloatAttribute> b_attr_radius,
                                       const int first_point_index,
                                       const int num_points,
                                       const float step)
@@ -641,12 +664,12 @@ static float4 interpolate_hair_points(BL::Hair b_hair,
   const int point_a = clamp((int)curve_t, 0, num_points - 1);
   const int point_b = min(point_a + 1, num_points - 1);
   const float t = curve_t - (float)point_a;
-  return lerp(hair_point_as_float4(b_hair.points[first_point_index + point_a]),
-              hair_point_as_float4(b_hair.points[first_point_index + point_b]),
+  return lerp(hair_point_as_float4(b_curves, b_attr_radius, first_point_index + point_a),
+              hair_point_as_float4(b_curves, b_attr_radius, first_point_index + point_b),
               t);
 }
 
-static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
+static void export_hair_curves(Scene *scene, Hair *hair, BL::Curves b_curves)
 {
   /* TODO: optimize so we can straight memcpy arrays from Blender? */
 
@@ -666,17 +689,19 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
   }
 
   /* Reserve memory. */
-  const int num_keys = b_hair.points.length();
-  const int num_curves = b_hair.curves.length();
+  const int num_keys = b_curves.points.length();
+  const int num_curves = b_curves.curves.length();
 
   hair->reserve_curves(num_curves, num_keys);
 
+  std::optional<BL::FloatAttribute> b_attr_radius = find_curves_radius_attribute(b_curves);
+
   /* Export curves and points. */
   vector<float> points_length;
 
-  for (BL::HairCurve &b_curve : b_hair.curves) {
-    const int first_point_index = b_curve.first_point_index();
-    const int num_points = b_curve.num_points();
+  for (int i = 0; i < num_curves; i++) {
+    const int first_point_index = b_curves.curve_offset_data[i].value();
+    const int num_points = b_curves.curve_offset_data[i + 1].value() - first_point_index;
 
     float3 prev_co = zero_float3();
     float length = 0.0f;
@@ -687,10 +712,9 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
 
     /* Position and radius. */
     for (int i = 0; i < num_points; i++) {
-      BL::HairPoint b_point = b_hair.points[first_point_index + i];
-
-      const float3 co = get_float3(b_point.co());
-      const float radius = b_point.radius();
+      const float3 co = get_float3(b_curves.position_data[first_point_index + i].vector());
+      const float radius = b_attr_radius ? b_attr_radius->data[first_point_index + i].value() :
+                                           0.0f;
       hair->add_curve_key(co, radius);
 
       if (attr_intercept) {
@@ -715,7 +739,7 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
 
     /* Random number per curve. */
     if (attr_random != NULL) {
-      attr_random->add(hash_uint2_to_float(b_curve.index(), 0));
+      attr_random->add(hash_uint2_to_float(i, 0));
     }
 
     /* Curve. */
@@ -724,7 +748,7 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
   }
 }
 
-static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_step)
+static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motion_step)
 {
   /* Find or add attribute. */
   Attribute *attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
@@ -737,14 +761,17 @@ static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_st
 
   /* Export motion keys. */
   const int num_keys = hair->get_curve_keys().size();
+  const int num_curves = b_curves.curves.length();
   float4 *mP = attr_mP->data_float4() + motion_step * num_keys;
   bool have_motion = false;
   int num_motion_keys = 0;
   int curve_index = 0;
 
-  for (BL::HairCurve &b_curve : b_hair.curves) {
-    const int first_point_index = b_curve.first_point_index();
-    const int num_points = b_curve.num_points();
+  std::optional<BL::FloatAttribute> b_attr_radius = find_curves_radius_attribute(b_curves);
+
+  for (int i = 0; i < num_curves; i++) {
+    const int first_point_index = b_curves.curve_offset_data[i].value();
+    const int num_points = b_curves.curve_offset_data[i + 1].value() - first_point_index;
 
     Hair::Curve curve = hair->get_curve(curve_index);
     curve_index++;
@@ -755,7 +782,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_st
         int point_index = first_point_index + i;
 
         if (point_index < num_keys) {
-          mP[num_motion_keys] = hair_point_as_float4(b_hair.points[point_index]);
+          mP[num_motion_keys] = hair_point_as_float4(b_curves, b_attr_radius, point_index);
           num_motion_keys++;
 
           if (!have_motion) {
@@ -774,7 +801,8 @@ static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_st
       const float step_size = curve.num_keys > 1 ? 1.0f / (curve.num_keys - 1) : 0.0f;
       for (int i = 0; i < curve.num_keys; i++) {
         const float step = i * step_size;
-        mP[num_motion_keys] = interpolate_hair_points(b_hair, first_point_index, num_points, step);
+        mP[num_motion_keys] = interpolate_hair_points(
+            b_curves, b_attr_radius, first_point_index, num_points, step);
         num_motion_keys++;
       }
       have_motion = true;
@@ -791,12 +819,12 @@ static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_st
 void BlenderSync::sync_hair(Hair *hair, BObjectInfo &b_ob_info, bool motion, int motion_step)
 {
   /* Convert Blender hair to Cycles curves. */
-  BL::Hair b_hair(b_ob_info.object_data);
+  BL::Curves b_curves(b_ob_info.object_data);
   if (motion) {
-    export_hair_curves_motion(hair, b_hair, motion_step);
+    export_hair_curves_motion(hair, b_curves, motion_step);
   }
   else {
-    export_hair_curves(scene, hair, b_hair);
+    export_hair_curves(scene, hair, b_curves);
   }
 }
 #else
@@ -819,8 +847,8 @@ void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, H
   new_hair.set_used_shaders(used_shaders);
 
   if (view_layer.use_hair) {
-#ifdef WITH_HAIR_NODES
-    if (b_ob_info.object_data.is_a(&RNA_Hair)) {
+#ifdef WITH_NEW_CURVES_TYPE
+    if (b_ob_info.object_data.is_a(&RNA_Curves)) {
       /* Hair object. */
       sync_hair(&new_hair, b_ob_info, false);
     }
@@ -873,8 +901,8 @@ void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,
 
   /* Export deformed coordinates. */
   if (ccl::BKE_object_is_deform_modified(b_ob_info, b_scene, preview)) {
-#ifdef WITH_HAIR_NODES
-    if (b_ob_info.object_data.is_a(&RNA_Hair)) {
+#ifdef WITH_NEW_CURVES_TYPE
+    if (b_ob_info.object_data.is_a(&RNA_Curves)) {
       /* Hair object. */
       sync_hair(hair, b_ob_info, true, motion_step);
       return;
diff --git a/intern/cycles/blender/geometry.cpp b/intern/cycles/blender/geometry.cpp
index 78c803b7adb..a9b61f2578f 100644
--- a/intern/cycles/blender/geometry.cpp
+++ b/intern/cycles/blender/geometry.cpp
@@ -32,8 +32,8 @@ CCL_NAMESPACE_BEGIN
 
 static Geometry::Type determine_geom_type(BObjectInfo &b_ob_info, bool use_particle_hair)
 {
-#ifdef WITH_HAIR_NODES
-  if (b_ob_info.object_data.is_a(&RNA_Hair) || use_particle_hair) {
+#ifdef WITH_NEW_CURVES_TYPE
+  if (b_ob_info.object_data.is_a(&RNA_Curves) || use_particle_hair) {
 #else
   if (use_particle_hair) {
 #endif
@@ -231,8 +231,8 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
     if (progress.get_cancel())
       return;
 
-#ifdef WITH_HAIR_NODES
-    if (b_ob_info.object_data.is_a(&RNA_Hair) || use_particle_hair) {
+#ifdef WITH_NEW_CURVES_TYPE
+    if (b_ob_info.object_data.is_a(&RNA_Curves) || use_particle_hair) {
 #else
     if (use_particle_hair) {
 #endif
diff --git a/intern/cycles/blender/object.cpp b/intern/cycles/blender/object.cpp
index 65a04a39660..22acc09c538 100644
--- a/intern/cycles/blender/object.cpp
+++ b/intern/cycles/blender/object.cpp
@@ -72,7 +72,7 @@ bool BlenderSync::object_is_geometry(BObjectInfo &b_ob_info)
 
   BL::Object::type_enum type = b_ob_info.iter_object.type();
 
-  if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR ||
+  if (type == BL::Object::type_VOLUME || type == BL::Object::type_CURVES ||
       type == BL::Object::type_POINTCLOUD) {
     /* Will be exported attached to mesh. */
     return true;
@@ -97,7 +97,7 @@ bool BlenderSync::object_can_have_geometry(BL::Object &b_ob)
     case BL::Object::type_SURFACE:
     case BL::Object::type_META:
     case BL::Object::type_FONT:
-    case BL::Object::type_HAIR:
+    case BL::Object::type_CURVES:
     case BL::Object::type_POINTCLOUD:
     case BL::Object::type_VOLUME:
       return true;
diff --git a/intern/cycles/blender/output_driver.cpp b/intern/cycles/blender/output_driver.cpp
index d5cc0c60bae..f35b48493cb 100644
--- a/intern/cycles/blender/output_driver.cpp
+++ b/intern/cycles/blender/output_driver.cpp
@@ -51,8 +51,6 @@ bool BlenderOutputDriver::read_render_tile(const Tile &tile)
 
   BL::RenderLayer b_rlay = *b_single_rlay;
 
-  vector<float> pixels(static_cast<size_t>(tile.size.x) * tile.size.y * 4);
-
   /* Copy each pass.
    * TODO:copy only the required ones for better performance? */
   for (BL::RenderPass &b_pass : b_rlay.passes) {
diff --git a/intern/cycles/blender/shader.cpp b/intern/cycles/blender/shader.cpp
index 5604c2989fd..39e49ac3478 100644
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -689,6 +689,9 @@ static ShaderNode *add_node(Scene *scene,
   else if (b_node.is_a(&RNA_ShaderNodeHairInfo)) {
     node = graph->create_node<HairInfoNode>();
   }
+  else if (b_node.is_a(&RNA_ShaderNodePointInfo)) {
+    node = graph->create_node<PointInfoNode>();
+  }
   else if (b_node.is_a(&RNA_ShaderNodeVolumeInfo)) {
     node = graph->create_node<VolumeInfoNode>();
   }
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index 588e057b9ad..7e6f1535d66 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -787,6 +787,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
     params.bvh_type = BVH_TYPE_DYNAMIC;
 
   params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
+  params.use_bvh_compact_structure = RNA_boolean_get(&cscene, "debug_use_compact_bvh");
   params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
   params.num_bvh_time_steps = RNA_int_get(&cscene, "debug_bvh_time_steps");
 
diff --git a/intern/cycles/bvh/embree.cpp b/intern/cycles/bvh/embree.cpp
index 618dd9438d5..616b6273e6a 100644
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -66,6 +66,26 @@ static_assert(Object::MAX_MOTION_STEPS == Geometry::MAX_MOTION_STEPS,
  * as well as filtering for volume objects happen here.
  * Cycles' own BVH does that directly inside the traversal calls.
  */
+static void rtc_filter_intersection_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
+  assert(args->N == 1);
+
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+    *args->valid = 0;
+  }
+}
+
+/* This gets called by Embree at every valid ray/object intersection.
+ * Things like recording subsurface or shadow hits for later evaluation
+ * as well as filtering for volume objects happen here.
+ * Cycles' own BVH does that directly inside the traversal calls.
+ */
 static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
 {
   /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -75,12 +95,16 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
   RTCHit *hit = (RTCHit *)args->hit;
   CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
   const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
 
   switch (ctx->type) {
     case CCLIntersectContext::RAY_SHADOW_ALL: {
       Intersection current_isect;
       kernel_embree_convert_hit(kg, ray, hit, &current_isect);
-
+      if (intersection_skip_self_shadow(cray->self, current_isect.object, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
       /* If no transparent shadows or max number of hits exceeded, all light is blocked. */
       const int flags = intersection_get_shader_flags(kg, current_isect.prim, current_isect.type);
       if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->num_hits >= ctx->max_hits) {
@@ -160,6 +184,10 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
           break;
         }
       }
+      if (intersection_skip_self_local(cray->self, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
 
       /* No intersection information requested, just return a hit. */
       if (ctx->max_hits == 0) {
@@ -225,6 +253,11 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
       if (ctx->num_hits < ctx->max_hits) {
         Intersection current_isect;
         kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (intersection_skip_self(cray->self, current_isect.object, current_isect.prim)) {
+          *args->valid = 0;
+          return;
+        }
+
         Intersection *isect = &ctx->isect_s[ctx->num_hits];
         ++ctx->num_hits;
         *isect = current_isect;
@@ -236,12 +269,15 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
         }
         /* This tells Embree to continue tracing. */
         *args->valid = 0;
-        break;
       }
+      break;
     }
     case CCLIntersectContext::RAY_REGULAR:
     default:
-      /* Nothing to do here. */
+      if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+        *args->valid = 0;
+        return;
+      }
       break;
   }
 }
@@ -257,6 +293,14 @@ static void rtc_filter_func_backface_cull(const RTCFilterFunctionNArguments *arg
     *args->valid = 0;
     return;
   }
+
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+    *args->valid = 0;
+  }
 }
 
 static void rtc_filter_occluded_func_backface_cull(const RTCFilterFunctionNArguments *args)
@@ -355,10 +399,12 @@ void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_)
   }
 
   const bool dynamic = params.bvh_type == BVH_TYPE_DYNAMIC;
+  const bool compact = params.use_compact_structure;
 
   scene = rtcNewScene(rtc_device);
   const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) |
-                                    RTC_SCENE_FLAG_COMPACT | RTC_SCENE_FLAG_ROBUST;
+                                    (compact ? RTC_SCENE_FLAG_COMPACT : RTC_SCENE_FLAG_NONE) |
+                                    RTC_SCENE_FLAG_ROBUST;
   rtcSetSceneFlags(scene, scene_flags);
   build_quality = dynamic ? RTC_BUILD_QUALITY_LOW :
                             (params.use_spatial_split ? RTC_BUILD_QUALITY_HIGH :
@@ -503,6 +549,7 @@ void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
 
   rtcSetGeometryUserData(geom_id, (void *)prim_offset);
   rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+  rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_intersection_func);
   rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
 
   rtcCommitGeometry(geom_id);
@@ -765,6 +812,7 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
 
   rtcSetGeometryUserData(geom_id, (void *)prim_offset);
   if (hair->curve_shape == CURVE_RIBBON) {
+    rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_intersection_func);
     rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
   }
   else {
diff --git a/intern/cycles/bvh/params.h b/intern/cycles/bvh/params.h
index 16edf2e88e4..61fa5484ce0 100644
--- a/intern/cycles/bvh/params.h
+++ b/intern/cycles/bvh/params.h
@@ -97,6 +97,9 @@ class BVHParams {
    */
   bool use_unaligned_nodes;
 
+  /* Use compact acceleration structure (Embree)*/
+  bool use_compact_structure;
+
   /* Split time range to this number of steps and create leaf node for each
    * of this time steps.
    *
@@ -139,6 +142,7 @@ class BVHParams {
 
     top_level = false;
     bvh_layout = BVH_LAYOUT_BVH2;
+    use_compact_structure = true;
     use_unaligned_nodes = false;
 
     num_motion_curve_steps = 0;
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index f46d18a4926..8d9631e5b44 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -559,10 +559,10 @@ if(WITH_CYCLES_DEVICE_METAL)
   find_library(METAL_LIBRARY Metal)
 
   # This file was added in the 12.0 SDK, use it as a way to detect the version.
-  if (METAL_LIBRARY AND NOT EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
+  if(METAL_LIBRARY AND NOT EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
     message(STATUS "Metal version too old, must be SDK 12.0 or newer, disabling WITH_CYCLES_DEVICE_METAL")
     set(WITH_CYCLES_DEVICE_METAL OFF)
-  elseif (NOT METAL_LIBRARY)
+  elseif(NOT METAL_LIBRARY)
     message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
     set(WITH_CYCLES_DEVICE_METAL OFF)
   else()
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 4f1cbabc89b..85ed3dc5b55 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -905,8 +905,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
       address_mode = hipAddressModeClamp;
       break;
     case EXTENSION_CLIP:
-      // TODO : (Arya) setting this to Mode Clamp instead of Mode Border because it's unsupported
-      // in hip
+      /* TODO(@arya): setting this to Mode Clamp instead of Mode Border
+       * because it's unsupported in HIP. */
       address_mode = hipAddressModeClamp;
       break;
     default:
diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h
index cbc5ca7d2c3..58d71e3928f 100644
--- a/intern/cycles/device/metal/bvh.h
+++ b/intern/cycles/device/metal/bvh.h
@@ -58,6 +58,11 @@ class BVHMetal : public BVH {
                        id<MTLCommandQueue> queue,
                        Geometry *const geom,
                        bool refit);
+  bool build_BLAS_pointcloud(Progress &progress,
+                             id<MTLDevice> device,
+                             id<MTLCommandQueue> queue,
+                             Geometry *const geom,
+                             bool refit);
   bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
 };
 
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
index 1953102cb41..8b252f1a5ec 100644
--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -19,6 +19,7 @@
 #  include "scene/hair.h"
 #  include "scene/mesh.h"
 #  include "scene/object.h"
+#  include "scene/pointcloud.h"
 
 #  include "util/progress.h"
 
@@ -475,6 +476,220 @@ bool BVHMetal::build_BLAS_hair(Progress &progress,
   return false;
 }
 
+bool BVHMetal::build_BLAS_pointcloud(Progress &progress,
+                                     id<MTLDevice> device,
+                                     id<MTLCommandQueue> queue,
+                                     Geometry *const geom,
+                                     bool refit)
+{
+  if (@available(macos 12.0, *)) {
+    /* Build BLAS for point cloud */
+    PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+    if (pointcloud->num_points() == 0) {
+      return false;
+    }
+
+    /*------------------------------------------------*/
+    BVH_status("Building pointcloud BLAS | %7d points | %s",
+               (int)pointcloud->num_points(),
+               geom->name.c_str());
+    /*------------------------------------------------*/
+
+    const size_t num_points = pointcloud->get_points().size();
+    const float3 *points = pointcloud->get_points().data();
+    const float *radius = pointcloud->get_radius().data();
+
+    const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+
+    size_t num_motion_steps = 1;
+    Attribute *motion_keys = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+    if (motion_blur && pointcloud->get_use_motion_blur() && motion_keys) {
+      num_motion_steps = pointcloud->get_motion_steps();
+    }
+
+    const size_t num_aabbs = num_motion_steps;
+
+    MTLResourceOptions storage_mode;
+    if (device.hasUnifiedMemory) {
+      storage_mode = MTLResourceStorageModeShared;
+    }
+    else {
+      storage_mode = MTLResourceStorageModeManaged;
+    }
+
+    /* Allocate a GPU buffer for the AABB data and populate it */
+    id<MTLBuffer> aabbBuf = [device
+        newBufferWithLength:num_aabbs * sizeof(MTLAxisAlignedBoundingBox)
+                    options:storage_mode];
+    MTLAxisAlignedBoundingBox *aabb_data = (MTLAxisAlignedBoundingBox *)[aabbBuf contents];
+
+    /* Get AABBs for each motion step */
+    size_t center_step = (num_motion_steps - 1) / 2;
+    for (size_t step = 0; step < num_motion_steps; ++step) {
+      /* The center step for motion vertices is not stored in the attribute */
+      if (step != center_step) {
+        size_t attr_offset = (step > center_step) ? step - 1 : step;
+        points = motion_keys->data_float3() + attr_offset * num_points;
+      }
+
+      for (size_t j = 0; j < num_points; ++j) {
+        const PointCloud::Point point = pointcloud->get_point(j);
+        BoundBox bounds = BoundBox::empty;
+        point.bounds_grow(points, radius, bounds);
+
+        const size_t index = step * num_points + j;
+        aabb_data[index].min = (MTLPackedFloat3 &)bounds.min;
+        aabb_data[index].max = (MTLPackedFloat3 &)bounds.max;
+      }
+    }
+
+    if (storage_mode == MTLResourceStorageModeManaged) {
+      [aabbBuf didModifyRange:NSMakeRange(0, aabbBuf.length)];
+    }
+
+#  if 0
+    for (size_t i=0; i<num_aabbs && i < 400; i++) {
+      MTLAxisAlignedBoundingBox& bb = aabb_data[i];
+      printf("  %d:   %.1f,%.1f,%.1f -- %.1f,%.1f,%.1f\n", int(i), bb.min.x, bb.min.y, bb.min.z, bb.max.x, bb.max.y, bb.max.z);
+    }
+#  endif
+
+    MTLAccelerationStructureGeometryDescriptor *geomDesc;
+    if (motion_blur) {
+      std::vector<MTLMotionKeyframeData *> aabb_ptrs;
+      aabb_ptrs.reserve(num_motion_steps);
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
+        k.buffer = aabbBuf;
+        k.offset = step * num_points * sizeof(MTLAxisAlignedBoundingBox);
+        aabb_ptrs.push_back(k);
+      }
+
+      MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor *geomDescMotion =
+          [MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor descriptor];
+      geomDescMotion.boundingBoxBuffers = [NSArray arrayWithObjects:aabb_ptrs.data()
+                                                              count:aabb_ptrs.size()];
+      geomDescMotion.boundingBoxCount = num_points;
+      geomDescMotion.boundingBoxStride = sizeof(aabb_data[0]);
+      geomDescMotion.intersectionFunctionTableOffset = 2;
+
+      /* Force a single any-hit call, so shadow record-all behavior works correctly */
+      /* (Match optix behavior: unsigned int build_flags =
+       * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+      geomDescMotion.allowDuplicateIntersectionFunctionInvocation = false;
+      geomDescMotion.opaque = true;
+      geomDesc = geomDescMotion;
+    }
+    else {
+      MTLAccelerationStructureBoundingBoxGeometryDescriptor *geomDescNoMotion =
+          [MTLAccelerationStructureBoundingBoxGeometryDescriptor descriptor];
+      geomDescNoMotion.boundingBoxBuffer = aabbBuf;
+      geomDescNoMotion.boundingBoxBufferOffset = 0;
+      geomDescNoMotion.boundingBoxCount = int(num_aabbs);
+      geomDescNoMotion.boundingBoxStride = sizeof(aabb_data[0]);
+      geomDescNoMotion.intersectionFunctionTableOffset = 2;
+
+      /* Force a single any-hit call, so shadow record-all behavior works correctly */
+      /* (Match optix behavior: unsigned int build_flags =
+       * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+      geomDescNoMotion.allowDuplicateIntersectionFunctionInvocation = false;
+      geomDescNoMotion.opaque = true;
+      geomDesc = geomDescNoMotion;
+    }
+
+    MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
+        [MTLPrimitiveAccelerationStructureDescriptor descriptor];
+    accelDesc.geometryDescriptors = @[ geomDesc ];
+
+    if (motion_blur) {
+      accelDesc.motionStartTime = 0.0f;
+      accelDesc.motionEndTime = 1.0f;
+      accelDesc.motionStartBorderMode = MTLMotionBorderModeVanish;
+      accelDesc.motionEndBorderMode = MTLMotionBorderModeVanish;
+      accelDesc.motionKeyframeCount = num_motion_steps;
+    }
+
+    if (!use_fast_trace_bvh) {
+      accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+                          MTLAccelerationStructureUsagePreferFastBuild);
+    }
+
+    MTLAccelerationStructureSizes accelSizes = [device
+        accelerationStructureSizesWithDescriptor:accelDesc];
+    id<MTLAccelerationStructure> accel_uncompressed = [device
+        newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+    id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+                                                   options:MTLResourceStorageModePrivate];
+    id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+    id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+    id<MTLAccelerationStructureCommandEncoder> accelEnc =
+        [accelCommands accelerationStructureCommandEncoder];
+    if (refit) {
+      [accelEnc refitAccelerationStructure:accel_struct
+                                descriptor:accelDesc
+                               destination:accel_uncompressed
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    else {
+      [accelEnc buildAccelerationStructure:accel_uncompressed
+                                descriptor:accelDesc
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    if (use_fast_trace_bvh) {
+      [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
+                                               toBuffer:sizeBuf
+                                                 offset:0
+                                           sizeDataType:MTLDataTypeULong];
+    }
+    [accelEnc endEncoding];
+    [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+      /* free temp resources */
+      [scratchBuf release];
+      [aabbBuf release];
+
+      if (use_fast_trace_bvh) {
+        /* Compact the accel structure */
+        uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
+
+        dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+          id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+          id<MTLAccelerationStructureCommandEncoder> accelEnc =
+              [accelCommands accelerationStructureCommandEncoder];
+          id<MTLAccelerationStructure> accel = [device
+              newAccelerationStructureWithSize:compressed_size];
+          [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
+                                toAccelerationStructure:accel];
+          [accelEnc endEncoding];
+          [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+            uint64_t allocated_size = [accel allocatedSize];
+            stats.mem_alloc(allocated_size);
+            accel_struct = accel;
+            [accel_uncompressed release];
+            accel_struct_building = false;
+          }];
+          [accelCommands commit];
+        });
+      }
+      else {
+        /* set our acceleration structure to the uncompressed structure */
+        accel_struct = accel_uncompressed;
+
+        uint64_t allocated_size = [accel_struct allocatedSize];
+        stats.mem_alloc(allocated_size);
+        accel_struct_building = false;
+      }
+      [sizeBuf release];
+    }];
+
+    accel_struct_building = true;
+    [accelCommands commit];
+    return true;
+  }
+  return false;
+}
+
 bool BVHMetal::build_BLAS(Progress &progress,
                           id<MTLDevice> device,
                           id<MTLCommandQueue> queue,
@@ -491,6 +706,8 @@ bool BVHMetal::build_BLAS(Progress &progress,
         return build_BLAS_mesh(progress, device, queue, geom, refit);
       case Geometry::HAIR:
         return build_BLAS_hair(progress, device, queue, geom, refit);
+      case Geometry::POINTCLOUD:
+        return build_BLAS_pointcloud(progress, device, queue, geom, refit);
       default:
         return false;
     }
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 4874af1bfa6..a4bfb30436d 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -36,6 +36,8 @@ enum {
   METALRT_FUNC_CURVE_RIBBON_SHADOW,
   METALRT_FUNC_CURVE_ALL,
   METALRT_FUNC_CURVE_ALL_SHADOW,
+  METALRT_FUNC_POINT,
+  METALRT_FUNC_POINT_SHADOW,
   METALRT_FUNC_NUM
 };
 
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index f948a8a0a0f..e9bd1cea5df 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -358,6 +358,8 @@ bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
           "__intersection__curve_ribbon_shadow",
           "__intersection__curve_all",
           "__intersection__curve_all_shadow",
+          "__intersection__point",
+          "__intersection__point_shadow",
       };
       assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
 
@@ -400,37 +402,51 @@ bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
     NSArray *function_list = nil;
 
     if (device->use_metalrt) {
-      id<MTLFunction> box_intersect_default = nil;
-      id<MTLFunction> box_intersect_shadow = nil;
+      id<MTLFunction> curve_intersect_default = nil;
+      id<MTLFunction> curve_intersect_shadow = nil;
+      id<MTLFunction> point_intersect_default = nil;
+      id<MTLFunction> point_intersect_shadow = nil;
       if (device->kernel_features & KERNEL_FEATURE_HAIR) {
         /* Add curve intersection programs. */
         if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
           /* Slower programs for thick hair since that also slows down ribbons.
            * Ideally this should not be needed. */
-          box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
-          box_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
+          curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
+          curve_intersect_shadow =
+              rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
         }
         else {
-          box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
-          box_intersect_shadow =
+          curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
+          curve_intersect_shadow =
               rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
         }
       }
+      if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+        point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT];
+        point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW];
+      }
       table_functions[METALRT_TABLE_DEFAULT] = [NSArray
           arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
-                           box_intersect_default ?
-                               box_intersect_default :
+                           curve_intersect_default ?
+                               curve_intersect_default :
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
+                           point_intersect_default ?
+                               point_intersect_default :
                                rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
                            nil];
       table_functions[METALRT_TABLE_SHADOW] = [NSArray
           arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
-                           box_intersect_shadow ?
-                               box_intersect_shadow :
+                           curve_intersect_shadow ?
+                               curve_intersect_shadow :
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
+                           point_intersect_shadow ?
+                               point_intersect_shadow :
                                rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
                            nil];
       table_functions[METALRT_TABLE_LOCAL] = [NSArray
           arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
                            rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
+                           rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
                            nil];
 
       NSMutableSet *unique_functions = [NSMutableSet
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 009661b2dec..cb6c36d5ea6 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -226,7 +226,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   pipeline_options.usesMotionBlur = false;
   pipeline_options.traversableGraphFlags =
       OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
-  pipeline_options.numPayloadValues = 6;
+  pipeline_options.numPayloadValues = 8;
   pipeline_options.numAttributeValues = 2; /* u, v */
   pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
   pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
index 2a5f99f358b..28cdeeb630a 100644
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -125,20 +125,41 @@ static Device *find_best_device(Device *device, DenoiserType type)
   return best_device;
 }
 
+static DeviceInfo find_best_denoiser_device_info(const vector<DeviceInfo> &device_infos,
+                                                 DenoiserType denoiser_type)
+{
+  for (const DeviceInfo &device_info : device_infos) {
+    if ((device_info.denoisers & denoiser_type) == 0) {
+      continue;
+    }
+
+    /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen
+     * on a physical CUDA device which is already used for rendering. */
+
+    /* TODO(sergey): Choose fastest device for denoising. */
+
+    return device_info;
+  }
+
+  DeviceInfo none_device;
+  none_device.type = DEVICE_NONE;
+  return none_device;
+}
+
 static unique_ptr<Device> create_denoiser_device(Device *path_trace_device,
-                                                 const uint device_type_mask)
+                                                 const uint device_type_mask,
+                                                 DenoiserType denoiser_type)
 {
   const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask);
   if (device_infos.empty()) {
     return nullptr;
   }
 
-  /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on
-   * a physical CUDA device which is already used for rendering. */
-
-  /* TODO(sergey): Choose fastest device for denoising. */
-
-  const DeviceInfo denoiser_device_info = device_infos.front();
+  const DeviceInfo denoiser_device_info = find_best_denoiser_device_info(device_infos,
+                                                                         denoiser_type);
+  if (denoiser_device_info.type == DEVICE_NONE) {
+    return nullptr;
+  }
 
   unique_ptr<Device> denoiser_device(
       Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler));
@@ -186,7 +207,8 @@ Device *Denoiser::ensure_denoiser_device(Progress *progress)
   device_creation_attempted_ = true;
 
   const uint device_type_mask = get_device_type_mask();
-  local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask);
+  local_denoiser_device_ = create_denoiser_device(
+      path_trace_device_, device_type_mask, params_.type);
   denoiser_device_ = local_denoiser_device_.get();
 
   return denoiser_device_;
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
index a08aec513fc..4676e69c4fb 100644
--- a/intern/cycles/integrator/denoiser_oidn.cpp
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -37,8 +37,6 @@ OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams &param
     : Denoiser(path_trace_device, params)
 {
   DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE);
-
-  DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform.";
 }
 
 #ifdef WITH_OPENIMAGEDENOISE
@@ -585,6 +583,9 @@ bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
                                   const int num_samples,
                                   bool allow_inplace_modification)
 {
+  DCHECK(openimagedenoise_supported())
+      << "OpenImageDenoiser is not supported on this platform or build.";
+
 #ifdef WITH_OPENIMAGEDENOISE
   thread_scoped_lock lock(mutex_);
 
@@ -635,4 +636,20 @@ uint OIDNDenoiser::get_device_type_mask() const
   return DEVICE_MASK_CPU;
 }
 
+Device *OIDNDenoiser::ensure_denoiser_device(Progress *progress)
+{
+#ifndef WITH_OPENIMAGEDENOISE
+  path_trace_device_->set_error("Build without OpenImageDenoiser");
+  return nullptr;
+#else
+  if (!openimagedenoise_supported()) {
+    path_trace_device_->set_error(
+        "OpenImageDenoiser is not supported on this CPU: missing SSE 4.1 support");
+    return nullptr;
+  }
+
+  return Denoiser::ensure_denoiser_device(progress);
+#endif
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h
index a0ec3e26b9c..2b815be973e 100644
--- a/intern/cycles/integrator/denoiser_oidn.h
+++ b/intern/cycles/integrator/denoiser_oidn.h
@@ -38,6 +38,7 @@ class OIDNDenoiser : public Denoiser {
 
  protected:
   virtual uint get_device_type_mask() const override;
+  virtual Device *ensure_denoiser_device(Progress *progress) override;
 
   /* We only perform one denoising at a time, since OpenImageDenoise itself is multithreaded.
    * Use this mutex whenever images are passed to the OIDN and needs to be denoised. */
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 4479442df56..9fa5aab9ea9 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -141,6 +141,7 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
   const PassType type = pass_access_info_.type;
   const PassMode mode = pass_access_info_.mode;
   const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
+  int num_written_components = pass_info.num_components;
 
   if (pass_info.num_components == 1) {
     /* Single channel passes. */
@@ -188,8 +189,10 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
     else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE ||
               pass_info.indirect_type != PASS_NONE) &&
              mode != PassMode::DENOISED) {
-      /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */
+      /* RGB lighting passes that need to divide out color and/or sum direct and indirect.
+       * These can also optionally write alpha like the combined pass. */
       get_pass_light_path(render_buffers, buffer_params, destination);
+      num_written_components = 4;
     }
     else {
       /* Passes that need no special computation, or denoised passes that already
@@ -215,7 +218,7 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
     }
   }
 
-  pad_pixels(buffer_params, destination, pass_info.num_components);
+  pad_pixels(buffer_params, destination, num_written_components);
 
   return true;
 }
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 0b55d1078a8..fd697836f52 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -820,8 +820,15 @@ void PathTrace::tile_buffer_read()
     return;
   }
 
+  /* Read buffers back from device. */
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    path_trace_work->copy_render_buffers_from_device();
+  });
+
+  /* Read (subset of) passes from output driver. */
   PathTraceTile tile(*this);
   if (output_driver_->read_render_tile(tile)) {
+    /* Copy buffers to device again. */
     tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
       path_trace_work->copy_render_buffers_to_device();
     });
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
index 95a1adeb016..0edd3810c39 100644
--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -157,7 +157,7 @@ bool ShaderEval::eval_gpu(Device *device,
   queue->init_execution();
 
   /* Execute work on GPU in chunk, so we can cancel.
-   * TODO : query appropriate size from device.*/
+   * TODO: query appropriate size from device. */
   const int32_t chunk_size = 65536;
 
   device_ptr d_input = input.device_pointer;
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 67804fb1d0d..1797bf60720 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -173,15 +173,16 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
   uint p3 = 0;
   uint p4 = visibility;
   uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
 
   uint ray_mask = visibility & 0xFF;
-  uint ray_flags = OPTIX_RAY_FLAG_NONE;
+  uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
   if (0 == ray_mask && (visibility & ~0xFF) != 0) {
     ray_mask = 0xFF;
-    ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
   }
   else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    ray_flags = OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
+    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
   }
 
   optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
@@ -200,7 +201,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
              p2,
              p3,
              p4,
-             p5);
+             p5,
+             p6,
+             p7);
 
   isect->t = __uint_as_float(p0);
   isect->u = __uint_as_float(p1);
@@ -242,6 +245,7 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
   }
 
   MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
   payload.u = 0.0f;
   payload.v = 0.0f;
   payload.visibility = visibility;
@@ -309,6 +313,7 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
     CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
     IntersectContext rtc_ctx(&ctx);
     RTCRayHit ray_hit;
+    ctx.ray = ray;
     kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
     rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit);
     if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID &&
@@ -356,6 +361,9 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
   uint p2 = pointer_pack_to_uint_0(local_isect);
   uint p3 = pointer_pack_to_uint_1(local_isect);
   uint p4 = local_object;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
   /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
   uint p5 = max_hits;
 
@@ -379,7 +387,9 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
              p2,
              p3,
              p4,
-             p5);
+             p5,
+             p6,
+             p7);
 
   return p5;
 #  elif defined(__METALRT__)
@@ -417,6 +427,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
   }
 
   MetalRTIntersectionLocalPayload payload;
+  payload.self = ray->self;
   payload.local_object = local_object;
   payload.max_hits = max_hits;
   payload.local_isect.num_hits = 0;
@@ -460,6 +471,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
         kg, has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
     ctx.lcg_state = lcg_state;
     ctx.max_hits = max_hits;
+    ctx.ray = ray;
     ctx.local_isect = local_isect;
     if (local_isect) {
       local_isect->num_hits = 0;
@@ -532,6 +544,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
   uint p3 = max_hits;
   uint p4 = visibility;
   uint p5 = false;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
 
   uint ray_mask = visibility & 0xFF;
   if (0 == ray_mask && (visibility & ~0xFF) != 0) {
@@ -555,7 +569,9 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
              p2,
              p3,
              p4,
-             p5);
+             p5,
+             p6,
+             p7);
 
   *num_recorded_hits = uint16_unpack_from_uint_0(p2);
   *throughput = __uint_as_float(p1);
@@ -588,6 +604,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
   }
 
   MetalRTIntersectionShadowPayload payload;
+  payload.self = ray->self;
   payload.visibility = visibility;
   payload.max_hits = max_hits;
   payload.num_hits = 0;
@@ -634,6 +651,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
     Intersection *isect_array = (Intersection *)state->shadow_isect;
     ctx.isect_s = isect_array;
     ctx.max_hits = max_hits;
+    ctx.ray = ray;
     IntersectContext rtc_ctx(&ctx);
     RTCRay rtc_ray;
     kernel_embree_setup_ray(*ray, rtc_ray, visibility);
@@ -685,6 +703,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
   uint p3 = 0;
   uint p4 = visibility;
   uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
 
   uint ray_mask = visibility & 0xFF;
   if (0 == ray_mask && (visibility & ~0xFF) != 0) {
@@ -708,7 +728,9 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
              p2,
              p3,
              p4,
-             p5);
+             p5,
+             p6,
+             p7);
 
   isect->t = __uint_as_float(p0);
   isect->u = __uint_as_float(p1);
@@ -744,6 +766,7 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
   }
 
   MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
   payload.visibility = visibility;
 
   typename metalrt_intersector_type::result_type intersection;
@@ -820,6 +843,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg,
     ctx.isect_s = isect;
     ctx.max_hits = max_hits;
     ctx.num_hits = 0;
+    ctx.ray = ray;
     IntersectContext rtc_ctx(&ctx);
     RTCRay rtc_ray;
     kernel_embree_setup_ray(*ray, rtc_ray, visibility);
diff --git a/intern/cycles/kernel/bvh/embree.h b/intern/cycles/kernel/bvh/embree.h
index 9edd4f90a7e..19c4b9f6f3d 100644
--- a/intern/cycles/kernel/bvh/embree.h
+++ b/intern/cycles/kernel/bvh/embree.h
@@ -22,6 +22,8 @@
 #include "kernel/device/cpu/compat.h"
 #include "kernel/device/cpu/globals.h"
 
+#include "kernel/bvh/util.h"
+
 #include "util/vector.h"
 
 CCL_NAMESPACE_BEGIN
@@ -38,6 +40,9 @@ struct CCLIntersectContext {
   KernelGlobals kg;
   RayType type;
 
+  /* For avoiding self intersections */
+  const Ray *ray;
+
   /* for shadow rays */
   Intersection *isect_s;
   uint max_hits;
@@ -56,6 +61,7 @@ struct CCLIntersectContext {
   {
     kg = kg_;
     type = type_;
+    ray = NULL;
     max_hits = 1;
     num_hits = 0;
     num_recorded_hits = 0;
@@ -102,7 +108,34 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
 {
   kernel_embree_setup_ray(ray, rayhit.ray, visibility);
   rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-  rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
+  rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+}
+
+ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg,
+                                                          const RTCHit *hit,
+                                                          const Ray *ray)
+{
+  bool status = false;
+  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
+    const int oID = hit->instID[0] / 2;
+    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
+      RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+          rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));
+      const int pID = hit->primID +
+                      (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+      status = intersection_skip_self_shadow(ray->self, oID, pID);
+    }
+  }
+  else {
+    const int oID = hit->geomID / 2;
+    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
+      const int pID = hit->primID + (intptr_t)rtcGetGeometryUserData(
+                                        rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
+      status = intersection_skip_self_shadow(ray->self, oID, pID);
+    }
+  }
+
+  return status;
 }
 
 ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
diff --git a/intern/cycles/kernel/bvh/local.h b/intern/cycles/kernel/bvh/local.h
index 4d0e6aac901..4ef6deef98d 100644
--- a/intern/cycles/kernel/bvh/local.h
+++ b/intern/cycles/kernel/bvh/local.h
@@ -157,7 +157,11 @@ ccl_device_inline
                 }
               }
 
+              /* Skip self intersection. */
               const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+              if (intersection_skip_self_local(ray->self, prim)) {
+                continue;
+              }
 
               if (triangle_intersect_local(kg,
                                            local_isect,
@@ -188,7 +192,11 @@ ccl_device_inline
                 }
               }
 
+              /* Skip self intersection. */
               const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+              if (intersection_skip_self_local(ray->self, prim)) {
+                continue;
+              }
 
               if (motion_triangle_intersect_local(kg,
                                                   local_isect,
diff --git a/intern/cycles/kernel/bvh/metal.h b/intern/cycles/kernel/bvh/metal.h
index 55456d15f50..5ab413d9314 100644
--- a/intern/cycles/kernel/bvh/metal.h
+++ b/intern/cycles/kernel/bvh/metal.h
@@ -15,6 +15,7 @@
  */
 
 struct MetalRTIntersectionPayload {
+  RaySelfPrimitives self;
   uint visibility;
   float u, v;
   int prim;
@@ -25,6 +26,7 @@ struct MetalRTIntersectionPayload {
 };
 
 struct MetalRTIntersectionLocalPayload {
+  RaySelfPrimitives self;
   uint local_object;
   uint lcg_state;
   short max_hits;
@@ -34,6 +36,7 @@ struct MetalRTIntersectionLocalPayload {
 };
 
 struct MetalRTIntersectionShadowPayload {
+  RaySelfPrimitives self;
   uint visibility;
 #if defined(__METALRT_MOTION__)
   float time;
diff --git a/intern/cycles/kernel/bvh/shadow_all.h b/intern/cycles/kernel/bvh/shadow_all.h
index 0fb86bfda77..59a7ba63045 100644
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -160,6 +160,9 @@ ccl_device_inline
                                         kernel_tex_fetch(__prim_object, prim_addr) :
                                         object;
             const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+            if (intersection_skip_self_shadow(ray->self, prim_object, prim)) {
+              continue;
+            }
 
             switch (type & PRIMITIVE_ALL) {
               case PRIMITIVE_TRIANGLE: {
diff --git a/intern/cycles/kernel/bvh/traversal.h b/intern/cycles/kernel/bvh/traversal.h
index dc2d1422df6..17cd357a069 100644
--- a/intern/cycles/kernel/bvh/traversal.h
+++ b/intern/cycles/kernel/bvh/traversal.h
@@ -133,35 +133,29 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
           --stack_ptr;
 
           /* primitive intersection */
-          switch (type & PRIMITIVE_ALL) {
-            case PRIMITIVE_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-
-                const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
-                                            object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+          for (; prim_addr < prim_addr2; prim_addr++) {
+            kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+
+            const int prim_object = (object == OBJECT_NONE) ?
+                                        kernel_tex_fetch(__prim_object, prim_addr) :
+                                        object;
+            const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+            if (intersection_skip_self_shadow(ray->self, prim_object, prim)) {
+              continue;
+            }
 
+            switch (type & PRIMITIVE_ALL) {
+              case PRIMITIVE_TRIANGLE: {
                 if (triangle_intersect(
                         kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr)) {
                   /* shadow ray early termination */
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
                 }
+                break;
               }
-              break;
-            }
 #if BVH_FEATURE(BVH_MOTION)
-            case PRIMITIVE_MOTION_TRIANGLE: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-
-                const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
-                                            object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
-
+              case PRIMITIVE_MOTION_TRIANGLE: {
                 if (motion_triangle_intersect(kg,
                                               isect,
                                               P,
@@ -176,28 +170,21 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
                 }
+                break;
               }
-              break;
-            }
 #endif /* BVH_FEATURE(BVH_MOTION) */
 #if BVH_FEATURE(BVH_HAIR)
-            case PRIMITIVE_CURVE_THICK:
-            case PRIMITIVE_MOTION_CURVE_THICK:
-            case PRIMITIVE_CURVE_RIBBON:
-            case PRIMITIVE_MOTION_CURVE_RIBBON: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
+              case PRIMITIVE_CURVE_THICK:
+              case PRIMITIVE_MOTION_CURVE_THICK:
+              case PRIMITIVE_CURVE_RIBBON:
+              case PRIMITIVE_MOTION_CURVE_RIBBON: {
                 if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
                   const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
                   if (ray->time < prim_time.x || ray->time > prim_time.y) {
-                    continue;
+                    break;
                   }
                 }
 
-                const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
-                                            object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
-
                 const int curve_type = kernel_tex_fetch(__prim_type, prim_addr);
                 const bool hit = curve_intersect(
                     kg, isect, P, dir, isect->t, prim_object, prim, ray->time, curve_type);
@@ -206,26 +193,19 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
                 }
+                break;
               }
-              break;
-            }
 #endif /* BVH_FEATURE(BVH_HAIR) */
 #if BVH_FEATURE(BVH_POINTCLOUD)
-            case PRIMITIVE_POINT:
-            case PRIMITIVE_MOTION_POINT: {
-              for (; prim_addr < prim_addr2; prim_addr++) {
+              case PRIMITIVE_POINT:
+              case PRIMITIVE_MOTION_POINT: {
                 if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
                   const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
                   if (ray->time < prim_time.x || ray->time > prim_time.y) {
-                    continue;
+                    break;
                   }
                 }
 
-                const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
-                                            object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
-
                 const int point_type = kernel_tex_fetch(__prim_type, prim_addr);
                 const bool hit = point_intersect(
                     kg, isect, P, dir, isect->t, prim_object, prim, ray->time, point_type);
@@ -234,10 +214,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
                 }
+                break;
               }
-              break;
-            }
 #endif /* BVH_FEATURE(BVH_POINTCLOUD) */
+            }
           }
         }
         else {
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index bd79c6e19c6..39c3ecd78c0 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -21,54 +21,22 @@ CCL_NAMESPACE_BEGIN
 /* Ray offset to avoid self intersection.
  *
  * This function should be used to compute a modified ray start position for
- * rays leaving from a surface. */
-
+ * rays leaving from a surface. This is from "A Fast and Robust Method for Avoiding
+ * Self-Intersection" see https://research.nvidia.com/publication/2019-03_A-Fast-and
+ */
 ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 {
-#ifdef __INTERSECTION_REFINE__
-  const float epsilon_f = 1e-5f;
-  /* ideally this should match epsilon_f, but instancing and motion blur
-   * precision makes it problematic */
-  const float epsilon_test = 1.0f;
-  const int epsilon_i = 32;
-
-  float3 res;
-
-  /* x component */
-  if (fabsf(P.x) < epsilon_test) {
-    res.x = P.x + Ng.x * epsilon_f;
-  }
-  else {
-    uint ix = __float_as_uint(P.x);
-    ix += ((ix ^ __float_as_uint(Ng.x)) >> 31) ? -epsilon_i : epsilon_i;
-    res.x = __uint_as_float(ix);
-  }
-
-  /* y component */
-  if (fabsf(P.y) < epsilon_test) {
-    res.y = P.y + Ng.y * epsilon_f;
-  }
-  else {
-    uint iy = __float_as_uint(P.y);
-    iy += ((iy ^ __float_as_uint(Ng.y)) >> 31) ? -epsilon_i : epsilon_i;
-    res.y = __uint_as_float(iy);
-  }
-
-  /* z component */
-  if (fabsf(P.z) < epsilon_test) {
-    res.z = P.z + Ng.z * epsilon_f;
-  }
-  else {
-    uint iz = __float_as_uint(P.z);
-    iz += ((iz ^ __float_as_uint(Ng.z)) >> 31) ? -epsilon_i : epsilon_i;
-    res.z = __uint_as_float(iz);
-  }
-
-  return res;
-#else
-  const float epsilon_f = 1e-4f;
-  return P + epsilon_f * Ng;
-#endif
+  const float int_scale = 256.0f;
+  int3 of_i = make_int3((int)(int_scale * Ng.x), (int)(int_scale * Ng.y), (int)(int_scale * Ng.z));
+
+  float3 p_i = make_float3(__int_as_float(__float_as_int(P.x) + ((P.x < 0) ? -of_i.x : of_i.x)),
+                           __int_as_float(__float_as_int(P.y) + ((P.y < 0) ? -of_i.y : of_i.y)),
+                           __int_as_float(__float_as_int(P.z) + ((P.z < 0) ? -of_i.z : of_i.z)));
+  const float origin = 1.0f / 32.0f;
+  const float float_scale = 1.0f / 65536.0f;
+  return make_float3(fabsf(P.x) < origin ? P.x + float_scale * Ng.x : p_i.x,
+                     fabsf(P.y) < origin ? P.y + float_scale * Ng.y : p_i.y,
+                     fabsf(P.z) < origin ? P.z + float_scale * Ng.z : p_i.z);
 }
 
 #if defined(__KERNEL_CPU__)
@@ -227,4 +195,25 @@ ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg,
   return (1.0f - u) * f0 + u * f1;
 }
 
+ccl_device_inline bool intersection_skip_self(ccl_private const RaySelfPrimitives &self,
+                                              const int object,
+                                              const int prim)
+{
+  return (self.prim == prim) && (self.object == object);
+}
+
+ccl_device_inline bool intersection_skip_self_shadow(ccl_private const RaySelfPrimitives &self,
+                                                     const int object,
+                                                     const int prim)
+{
+  return ((self.prim == prim) && (self.object == object)) ||
+         ((self.light_prim == prim) && (self.light_object == object));
+}
+
+ccl_device_inline bool intersection_skip_self_local(ccl_private const RaySelfPrimitives &self,
+                                                    const int prim)
+{
+  return (self.prim == prim);
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/volume.h b/intern/cycles/kernel/bvh/volume.h
index c0746c8efc3..95bba4f071d 100644
--- a/intern/cycles/kernel/bvh/volume.h
+++ b/intern/cycles/kernel/bvh/volume.h
@@ -144,6 +144,9 @@ ccl_device_inline
                                             kernel_tex_fetch(__prim_object, prim_addr) :
                                             object;
                 const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                if (intersection_skip_self(ray->self, prim_object, prim)) {
+                  continue;
+                }
 
                 int object_flag = kernel_tex_fetch(__object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
@@ -164,6 +167,9 @@ ccl_device_inline
                                             kernel_tex_fetch(__prim_object, prim_addr) :
                                             object;
                 const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                if (intersection_skip_self(ray->self, prim_object, prim)) {
+                  continue;
+                }
                 int object_flag = kernel_tex_fetch(__object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
diff --git a/intern/cycles/kernel/bvh/volume_all.h b/intern/cycles/kernel/bvh/volume_all.h
index a88c9d95d46..9f53e987cf1 100644
--- a/intern/cycles/kernel/bvh/volume_all.h
+++ b/intern/cycles/kernel/bvh/volume_all.h
@@ -147,6 +147,9 @@ ccl_device_inline
                                             kernel_tex_fetch(__prim_object, prim_addr) :
                                             object;
                 const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                if (intersection_skip_self(ray->self, prim_object, prim)) {
+                  continue;
+                }
                 int object_flag = kernel_tex_fetch(__object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
@@ -188,6 +191,9 @@ ccl_device_inline
                                             kernel_tex_fetch(__prim_object, prim_addr) :
                                             object;
                 const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                if (intersection_skip_self(ray->self, prim_object, prim)) {
+                  continue;
+                }
                 int object_flag = kernel_tex_fetch(__object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index deb7dafe55e..6b77940660f 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -40,6 +40,27 @@ struct TriangleIntersectionResult
 
 enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };
 
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+                                              const int object,
+                                              const int prim)
+{
+  return (self.prim == prim) && (self.object == object);
+}
+
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+                                                     const int object,
+                                                     const int prim)
+{
+  return ((self.prim == prim) && (self.object == object)) ||
+         ((self.light_prim == prim) && (self.light_object == object));
+}
+
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+                                                    const int prim)
+{
+  return (self.prim == prim);
+}
+
 template<typename TReturn, uint intersection_type>
 TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                           ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
@@ -53,8 +74,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #ifdef __BVH_LOCAL__
   uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
 
-  if (object != payload.local_object) {
-    /* Only intersect with matching object */
+  if ((object != payload.local_object) || intersection_skip_self_local(payload.self, prim)) {
+    /* Only intersect with matching object and skip self-intersecton. */
     result.accept = false;
     result.continue_search = true;
     return result;
@@ -166,6 +187,11 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   }
 #  endif
 
+  if (intersection_skip_self_shadow(payload.self, object, prim)) {
+    /* continue search */
+    return true;
+  }
+
   float u = 0.0f, v = 0.0f;
   int type = 0;
   if (intersection_type == METALRT_HIT_TRIANGLE) {
@@ -322,21 +348,35 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
   }
 #  endif
 
-#  ifdef __VISIBILITY_FLAG__
   uint visibility = payload.visibility;
+#  ifdef __VISIBILITY_FLAG__
   if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
     result.accept = false;
     result.continue_search = true;
     return result;
   }
+#  endif
 
   /* Shadow ray early termination. */
   if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    result.accept = true;
-    result.continue_search = false;
-    return result;
+    if (intersection_skip_self_shadow(payload.self, object, prim)) {
+      result.accept = false;
+      result.continue_search = true;
+      return result;
+    }
+    else {
+      result.accept = true;
+      result.continue_search = false;
+      return result;
+    }
+  }
+  else {
+    if (intersection_skip_self(payload.self, object, prim)) {
+      result.accept = false;
+      result.continue_search = true;
+      return result;
+    }
   }
-#  endif
 
   result.accept = true;
   result.continue_search = true;
@@ -576,6 +616,150 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal
 
   return result;
 }
-
 #endif /* __HAIR__ */
+
+#ifdef __POINTCLOUD__
+ccl_device_inline
+void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
+                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+                                const uint object,
+                                const uint prim,
+                                const uint type,
+                                const float3 ray_origin,
+                                const float3 ray_direction,
+                                float time,
+                                const float ray_tmax,
+                                thread BoundingBoxIntersectionResult &result)
+{
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = payload.visibility;
+  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  float3 P = ray_origin;
+  float3 dir = ray_direction;
+
+  /* The direction is not normalized by default, but the point intersection routine expects that */
+  float len;
+  dir = normalize_len(dir, &len);
+
+  Intersection isect;
+  isect.t = ray_tmax;
+  /* Transform maximum distance into object space. */
+  if (isect.t != FLT_MAX)
+    isect.t *= len;
+
+  MetalKernelContext context(launch_params_metal);
+  if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+    result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
+                  launch_params_metal, payload, object, prim, isect.u);
+    if (result.accept) {
+      result.distance = isect.t / len;
+      payload.u = isect.u;
+      payload.v = isect.v;
+      payload.prim = prim;
+      payload.type = type;
+    }
+  }
+}
+
+ccl_device_inline
+void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal,
+                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+                                       const uint object,
+                                       const uint prim,
+                                       const uint type,
+                                       const float3 ray_origin,
+                                       const float3 ray_direction,
+                                       float time,
+                                       const float ray_tmax,
+                                       thread BoundingBoxIntersectionResult &result)
+{
+  const uint visibility = payload.visibility;
+
+  float3 P = ray_origin;
+  float3 dir = ray_direction;
+
+  /* The direction is not normalized by default, but the point intersection routine expects that */
+  float len;
+  dir = normalize_len(dir, &len);
+
+  Intersection isect;
+  isect.t = ray_tmax;
+  /* Transform maximum distance into object space */
+  if (isect.t != FLT_MAX)
+    isect.t *= len;
+
+  MetalKernelContext context(launch_params_metal);
+  if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+    result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
+                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+    result.accept = !result.continue_search;
+
+    if (result.accept) {
+      result.distance = isect.t / len;
+    }
+  }
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                             const uint object [[user_instance_id]],
+                             const uint primitive_id [[primitive_id]],
+                             const float3 ray_origin [[origin]],
+                             const float3 ray_direction [[direction]],
+                             const float ray_tmax [[max_distance]])
+{
+  const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+  const int type = kernel_tex_fetch(__objects, object).primitive_type;
+
+  BoundingBoxIntersectionResult result;
+  result.accept = false;
+  result.continue_search = true;
+  result.distance = ray_tmax;
+
+  metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+#  if defined(__METALRT_MOTION__)
+                             payload.time,
+#  else
+                             0.0f,
+#  endif
+                             ray_tmax, result);
+
+  return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+                                    const uint object [[user_instance_id]],
+                                    const uint primitive_id [[primitive_id]],
+                                    const float3 ray_origin [[origin]],
+                                    const float3 ray_direction [[direction]],
+                                    const float ray_tmax [[max_distance]])
+{
+  const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+  const int type = kernel_tex_fetch(__objects, object).primitive_type;
+
+  BoundingBoxIntersectionResult result;
+  result.accept = false;
+  result.continue_search = true;
+  result.distance = ray_tmax;
+
+  metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+#  if defined(__METALRT_MOTION__)
+                             payload.time,
+#  else
+                             0.0f,
+#  endif
+                             ray_tmax, result);
+
+  return result;
+}
+#endif /* __POINTCLOUD__ */
 #endif /* __METALRT__ */
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index aa210b31a95..8e3d57bff8a 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -45,6 +45,11 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
   return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
 }
 
+template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
+{
+  return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
+}
+
 ccl_device_forceinline int get_object_id()
 {
 #ifdef __OBJECT_MOTION__
@@ -111,6 +116,12 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
     return optixIgnoreIntersection();
   }
 
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_local(ray->self, prim)) {
+    return optixIgnoreIntersection();
+  }
+
   const uint max_hits = optixGetPayload_5();
   if (max_hits == 0) {
     /* Special case for when no hit information is requested, just report that something was hit */
@@ -149,8 +160,6 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
     local_isect->num_hits = 1;
   }
 
-  const int prim = optixGetPrimitiveIndex();
-
   Intersection *isect = &local_isect->hits[hit];
   isect->t = optixGetRayTmax();
   isect->prim = prim;
@@ -185,6 +194,11 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
   }
 #  endif
 
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_shadow(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+
   float u = 0.0f, v = 0.0f;
   int type = 0;
   if (optixIsTriangleHit()) {
@@ -314,6 +328,12 @@ extern "C" __global__ void __anyhit__kernel_optix_volume_test()
   if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
     return optixIgnoreIntersection();
   }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
 }
 
 extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
@@ -330,18 +350,31 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
 #  endif
 #endif
 
-#ifdef __VISIBILITY_FLAG__
   const uint object = get_object_id();
   const uint visibility = optixGetPayload_4();
+#ifdef __VISIBILITY_FLAG__
   if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
     return optixIgnoreIntersection();
   }
+#endif
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
 
-  /* Shadow ray early termination. */
   if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    return optixTerminateRay();
+    if (intersection_skip_self_shadow(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+    else {
+      /* Shadow ray early termination. */
+      return optixTerminateRay();
+    }
+  }
+  else {
+    if (intersection_skip_self(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
   }
-#endif
 }
 
 extern "C" __global__ void __closesthit__kernel_optix_hit()
diff --git a/intern/cycles/kernel/film/read.h b/intern/cycles/kernel/film/read.h
index 18a593a75b1..ba895fd8909 100644
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -214,6 +214,21 @@ ccl_device_inline void film_get_pass_pixel_light_path(
   pixel[0] = f.x;
   pixel[1] = f.y;
   pixel[2] = f.z;
+
+  /* Optional alpha channel. */
+  if (kfilm_convert->num_components >= 4) {
+    if (kfilm_convert->pass_combined != PASS_UNUSED) {
+      float scale, scale_exposure;
+      film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+      ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+      const float alpha = in_combined[3] * scale;
+      pixel[3] = film_transparency_to_alpha(alpha);
+    }
+    else {
+      pixel[3] = 1.0f;
+    }
+  }
 }
 
 ccl_device_inline void film_get_pass_pixel_float3(ccl_global const KernelFilmConvert *ccl_restrict
diff --git a/intern/cycles/kernel/geom/curve.h b/intern/cycles/kernel/geom/curve.h
index 8a63f01643b..48ee8226e89 100644
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -226,6 +226,18 @@ ccl_device float curve_thickness(KernelGlobals kg, ccl_private const ShaderData
   return r * 2.0f;
 }
 
+/* Curve random */
+
+ccl_device float curve_random(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  if (sd->type & PRIMITIVE_CURVE) {
+    const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_CURVE_RANDOM);
+    return (desc.offset != ATTR_STD_NOT_FOUND) ? curve_attribute_float(kg, sd, desc, NULL, NULL) :
+                                                 0.0f;
+  }
+  return 0.0f;
+}
+
 /* Curve location for motion pass, linear interpolation between keys and
  * ignoring radius because we do the same for the motion keys */
 
diff --git a/intern/cycles/kernel/geom/motion_triangle_intersect.h b/intern/cycles/kernel/geom/motion_triangle_intersect.h
index cb6d210d90f..a11cb88385b 100644
--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -29,46 +29,19 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance.
+/**
+ * Use the barycentric coordinates to get the intersection location
  */
-
-ccl_device_inline float3 motion_triangle_refine(KernelGlobals kg,
-                                                ccl_private ShaderData *sd,
-                                                float3 P,
-                                                float3 D,
-                                                float t,
-                                                const int isect_object,
-                                                const int isect_prim,
-                                                float3 verts[3])
+ccl_device_inline float3 motion_triangle_point_from_uv(KernelGlobals kg,
+                                                       ccl_private ShaderData *sd,
+                                                       const int isect_object,
+                                                       const int isect_prim,
+                                                       const float u,
+                                                       const float v,
+                                                       float3 verts[3])
 {
-#ifdef __INTERSECTION_REFINE__
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    if (UNLIKELY(t == 0.0f)) {
-      return P;
-    }
-    const Transform tfm = object_get_inverse_transform(kg, sd);
-
-    P = transform_point(&tfm, P);
-    D = transform_direction(&tfm, D * t);
-    D = normalize_len(D, &t);
-  }
-
-  P = P + D * t;
-
-  /* Compute refined intersection distance. */
-  const float3 e1 = verts[0] - verts[2];
-  const float3 e2 = verts[1] - verts[2];
-  const float3 s1 = cross(D, e2);
-
-  const float invdivisor = 1.0f / dot(s1, e1);
-  const float3 d = P - verts[2];
-  const float3 s2 = cross(d, e1);
-  float rt = dot(e2, s2) * invdivisor;
-
-  /* Compute refined position. */
-  P = P + D * rt;
+  float w = 1.0f - u - v;
+  float3 P = u * verts[0] + v * verts[1] + w * verts[2];
 
   if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
     const Transform tfm = object_get_transform(kg, sd);
@@ -76,71 +49,8 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals kg,
   }
 
   return P;
-#else
-  return P + D * t;
-#endif
 }
 
-/* Same as above, except that t is assumed to be in object space
- * for instancing.
- */
-
-#ifdef __BVH_LOCAL__
-#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
-ccl_device_noinline
-#  else
-ccl_device_inline
-#  endif
-    float3
-    motion_triangle_refine_local(KernelGlobals kg,
-                                 ccl_private ShaderData *sd,
-                                 float3 P,
-                                 float3 D,
-                                 float t,
-                                 const int isect_object,
-                                 const int isect_prim,
-                                 float3 verts[3])
-{
-#  if defined(__KERNEL_GPU_RAYTRACING__)
-  /* t is always in world space with OptiX and MetalRT. */
-  return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts);
-#  else
-#    ifdef __INTERSECTION_REFINE__
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    const Transform tfm = object_get_inverse_transform(kg, sd);
-
-    P = transform_point(&tfm, P);
-    D = transform_direction(&tfm, D);
-    D = normalize(D);
-  }
-
-  P = P + D * t;
-
-  /* compute refined intersection distance */
-  const float3 e1 = verts[0] - verts[2];
-  const float3 e2 = verts[1] - verts[2];
-  const float3 s1 = cross(D, e2);
-
-  const float invdivisor = 1.0f / dot(s1, e1);
-  const float3 d = P - verts[2];
-  const float3 s2 = cross(d, e1);
-  float rt = dot(e2, s2) * invdivisor;
-
-  P = P + D * rt;
-
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    const Transform tfm = object_get_transform(kg, sd);
-    P = transform_point(&tfm, P);
-  }
-
-  return P;
-#    else  /* __INTERSECTION_REFINE__ */
-  return P + D * t;
-#    endif /* __INTERSECTION_REFINE__ */
-#  endif
-}
-#endif /* __BVH_LOCAL__ */
-
 /* Ray intersection. We simply compute the vertex positions at the given ray
  * time and do a ray intersection with the resulting triangle.
  */
diff --git a/intern/cycles/kernel/geom/motion_triangle_shader.h b/intern/cycles/kernel/geom/motion_triangle_shader.h
index fc7c181882e..15730c83969 100644
--- a/intern/cycles/kernel/geom/motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/motion_triangle_shader.h
@@ -68,15 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
   verts[1] = (1.0f - t) * verts[1] + t * next_verts[1];
   verts[2] = (1.0f - t) * verts[2] + t * next_verts[2];
   /* Compute refined position. */
-#ifdef __BVH_LOCAL__
-  if (is_local) {
-    sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
-  }
-  else
-#endif /* __BVH_LOCAL__*/
-  {
-    sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
-  }
+  sd->P = motion_triangle_point_from_uv(kg, sd, isect_object, isect_prim, sd->u, sd->v, verts);
   /* Compute face normal. */
   float3 Ng;
   if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
diff --git a/intern/cycles/kernel/geom/point.h b/intern/cycles/kernel/geom/point.h
index 52a1e77d71a..545b5c7fa43 100644
--- a/intern/cycles/kernel/geom/point.h
+++ b/intern/cycles/kernel/geom/point.h
@@ -81,7 +81,7 @@ ccl_device float3 point_attribute_float3(KernelGlobals kg,
 #  endif
 
   if (desc.element == ATTR_ELEMENT_VERTEX) {
-    return float4_to_float3(kernel_tex_fetch(__attributes_float4, desc.offset + sd->prim));
+    return kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim);
   }
   else {
     return make_float3(0.0f, 0.0f, 0.0f);
@@ -109,17 +109,59 @@ ccl_device float4 point_attribute_float4(KernelGlobals kg,
   }
 }
 
+/* Point position */
+
+ccl_device float3 point_position(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  if (sd->type & PRIMITIVE_POINT) {
+    /* World space center. */
+    float3 P = (sd->type & PRIMITIVE_MOTION) ?
+                   float4_to_float3(motion_point(kg, sd->object, sd->prim, sd->time)) :
+                   float4_to_float3(kernel_tex_fetch(__points, sd->prim));
+
+    if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+      object_position_transform(kg, sd, &P);
+    }
+
+    return P;
+  }
+
+  return zero_float3();
+}
+
 /* Point radius */
 
 ccl_device float point_radius(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
   if (sd->type & PRIMITIVE_POINT) {
-    return kernel_tex_fetch(__points, sd->prim).w;
+    /* World space radius. */
+    const float r = kernel_tex_fetch(__points, sd->prim).w;
+
+    if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
+      return r;
+    }
+    else {
+      float3 dir = make_float3(r, r, r);
+      object_dir_transform(kg, sd, &dir);
+      return average(dir);
+    }
   }
 
   return 0.0f;
 }
 
+/* Point random */
+
+ccl_device float point_random(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  if (sd->type & PRIMITIVE_POINT) {
+    const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POINT_RANDOM);
+    return (desc.offset != ATTR_STD_NOT_FOUND) ? point_attribute_float(kg, sd, desc, NULL, NULL) :
+                                                 0.0f;
+  }
+  return 0.0f;
+}
+
 /* Point location for motion pass, linear interpolation between keys and
  * ignoring radius because we do the same for the motion keys */
 
diff --git a/intern/cycles/kernel/geom/shader_data.h b/intern/cycles/kernel/geom/shader_data.h
index f5055d8b285..fdf914d85e0 100644
--- a/intern/cycles/kernel/geom/shader_data.h
+++ b/intern/cycles/kernel/geom/shader_data.h
@@ -89,7 +89,7 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
       sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
       /* vectors */
-      sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+      sd->P = triangle_point_from_uv(kg, sd, isect->object, isect->prim, isect->u, isect->v);
       sd->Ng = Ng;
       sd->N = Ng;
 
@@ -190,40 +190,46 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals kg,
 #ifdef __OBJECT_MOTION__
     shader_setup_object_transforms(kg, sd, time);
 #endif
-  }
-  else if (lamp != LAMP_NONE) {
-    sd->lamp = lamp;
-  }
 
-  /* transform into world space */
-  if (object_space) {
-    object_position_transform_auto(kg, sd, &sd->P);
-    object_normal_transform_auto(kg, sd, &sd->Ng);
-    sd->N = sd->Ng;
-    object_dir_transform_auto(kg, sd, &sd->I);
-  }
+    /* transform into world space */
+    if (object_space) {
+      object_position_transform_auto(kg, sd, &sd->P);
+      object_normal_transform_auto(kg, sd, &sd->Ng);
+      sd->N = sd->Ng;
+      object_dir_transform_auto(kg, sd, &sd->I);
+    }
 
-  if (sd->type == PRIMITIVE_TRIANGLE) {
-    /* smooth normal */
-    if (sd->shader & SHADER_SMOOTH_NORMAL) {
-      sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+    if (sd->type == PRIMITIVE_TRIANGLE) {
+      /* smooth normal */
+      if (sd->shader & SHADER_SMOOTH_NORMAL) {
+        sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
-      if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-        object_normal_transform_auto(kg, sd, &sd->N);
+        if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+          object_normal_transform_auto(kg, sd, &sd->N);
+        }
       }
-    }
 
-    /* dPdu/dPdv */
+      /* dPdu/dPdv */
 #ifdef __DPDU__
-    triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+      triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
-    if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-      object_dir_transform_auto(kg, sd, &sd->dPdu);
-      object_dir_transform_auto(kg, sd, &sd->dPdv);
+      if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+        object_dir_transform_auto(kg, sd, &sd->dPdu);
+        object_dir_transform_auto(kg, sd, &sd->dPdv);
+      }
+#endif
     }
+    else {
+#ifdef __DPDU__
+      sd->dPdu = zero_float3();
+      sd->dPdv = zero_float3();
 #endif
+    }
   }
   else {
+    if (lamp != LAMP_NONE) {
+      sd->lamp = lamp;
+    }
 #ifdef __DPDU__
     sd->dPdu = zero_float3();
     sd->dPdv = zero_float3();
diff --git a/intern/cycles/kernel/geom/triangle_intersect.h b/intern/cycles/kernel/geom/triangle_intersect.h
index 0169b40bc34..8458cf020a0 100644
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -142,116 +142,23 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
 }
 #endif /* __BVH_LOCAL__ */
 
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-/* Reintersections uses the paper:
- *
- * Tomas Moeller
- * Fast, minimum storage ray/triangle intersection
- * http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
+/**
+ * Use the barycentric coordinates to get the intersection location
  */
-
-ccl_device_inline float3 triangle_refine(KernelGlobals kg,
-                                         ccl_private ShaderData *sd,
-                                         float3 P,
-                                         float3 D,
-                                         float t,
-                                         const int isect_object,
-                                         const int isect_prim)
+ccl_device_inline float3 triangle_point_from_uv(KernelGlobals kg,
+                                                ccl_private ShaderData *sd,
+                                                const int isect_object,
+                                                const int isect_prim,
+                                                const float u,
+                                                const float v)
 {
-#ifdef __INTERSECTION_REFINE__
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    if (UNLIKELY(t == 0.0f)) {
-      return P;
-    }
-    const Transform tfm = object_get_inverse_transform(kg, sd);
-
-    P = transform_point(&tfm, P);
-    D = transform_direction(&tfm, D * t);
-    D = normalize_len(D, &t);
-  }
-
-  P = P + D * t;
-
   const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
   const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
                       tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
                       tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
-  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
-  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
-  float3 qvec = cross(tvec, edge1);
-  float3 pvec = cross(D, edge2);
-  float det = dot(edge1, pvec);
-  if (det != 0.0f) {
-    /* If determinant is zero it means ray lies in the plane of
-     * the triangle. It is possible in theory due to watertight
-     * nature of triangle intersection. For such cases we simply
-     * don't refine intersection hoping it'll go all fine.
-     */
-    float rt = dot(edge2, qvec) / det;
-    P = P + D * rt;
-  }
-
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    const Transform tfm = object_get_transform(kg, sd);
-    P = transform_point(&tfm, P);
-  }
-
-  return P;
-#else
-  return P + D * t;
-#endif
-}
-
-/* Same as above, except that t is assumed to be in object space for
- * instancing.
- */
-ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,
-                                               ccl_private ShaderData *sd,
-                                               float3 P,
-                                               float3 D,
-                                               float t,
-                                               const int isect_object,
-                                               const int isect_prim)
-{
-#if defined(__KERNEL_GPU_RAYTRACING__)
-  /* t is always in world space with OptiX and MetalRT. */
-  return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim);
-#else
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    const Transform tfm = object_get_inverse_transform(kg, sd);
-
-    P = transform_point(&tfm, P);
-    D = transform_direction(&tfm, D);
-    D = normalize(D);
-  }
+  float w = 1.0f - u - v;
 
-  P = P + D * t;
-
-#  ifdef __INTERSECTION_REFINE__
-  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
-  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
-  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
-  float3 qvec = cross(tvec, edge1);
-  float3 pvec = cross(D, edge2);
-  float det = dot(edge1, pvec);
-  if (det != 0.0f) {
-    /* If determinant is zero it means ray lies in the plane of
-     * the triangle. It is possible in theory due to watertight
-     * nature of triangle intersection. For such cases we simply
-     * don't refine intersection hoping it'll go all fine.
-     */
-    float rt = dot(edge2, qvec) / det;
-    P = P + D * rt;
-  }
-#  endif /* __INTERSECTION_REFINE__ */
+  float3 P = u * tri_a + v * tri_b + w * tri_c;
 
   if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
     const Transform tfm = object_get_transform(kg, sd);
@@ -259,7 +166,6 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,
   }
 
   return P;
-#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h
index df710dc1d82..4c5265189fa 100644
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -328,6 +328,12 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
 
   /* Scene Intersection. */
   Intersection isect ccl_optional_struct_init;
+  isect.object = OBJECT_NONE;
+  isect.prim = PRIM_NONE;
+  ray.self.object = last_isect_object;
+  ray.self.prim = last_isect_prim;
+  ray.self.light_object = OBJECT_NONE;
+  ray.self.light_prim = PRIM_NONE;
   bool hit = scene_intersect(kg, &ray, visibility, &isect);
 
   /* TODO: remove this and do it in the various intersection functions instead. */
diff --git a/intern/cycles/kernel/integrator/intersect_shadow.h b/intern/cycles/kernel/integrator/intersect_shadow.h
index 90422445fad..1ba8724826b 100644
--- a/intern/cycles/kernel/integrator/intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/intersect_shadow.h
@@ -156,7 +156,10 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt
   /* Read ray from integrator state into local memory. */
   Ray ray ccl_optional_struct_init;
   integrator_state_read_shadow_ray(kg, state, &ray);
-
+  ray.self.object = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, object);
+  ray.self.prim = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, prim);
+  ray.self.light_object = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 1, object);
+  ray.self.light_prim = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 1, prim);
   /* Compute visibility. */
   const uint visibility = integrate_intersect_shadow_visibility(kg, state);
 
diff --git a/intern/cycles/kernel/integrator/intersect_volume_stack.h b/intern/cycles/kernel/integrator/intersect_volume_stack.h
index 9fa5ff63ad2..ee3d82ebacb 100644
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -38,7 +38,10 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
   Ray volume_ray ccl_optional_struct_init;
   volume_ray.P = from_P;
   volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
-
+  volume_ray.self.object = INTEGRATOR_STATE(state, isect, object);
+  volume_ray.self.prim = INTEGRATOR_STATE(state, isect, prim);
+  volume_ray.self.light_object = OBJECT_NONE;
+  volume_ray.self.light_prim = PRIM_NONE;
   /* Store to avoid global fetches on every intersection step. */
   const uint volume_stack_size = kernel_data.volume_stack_size;
 
@@ -68,7 +71,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
     volume_stack_enter_exit(kg, state, stack_sd);
 
     /* Move ray forward. */
-    volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+    volume_ray.P = stack_sd->P;
     if (volume_ray.t != FLT_MAX) {
       volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
     }
@@ -91,6 +94,10 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
    * fewest hits. */
   volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
   volume_ray.t = FLT_MAX;
+  volume_ray.self.object = OBJECT_NONE;
+  volume_ray.self.prim = PRIM_NONE;
+  volume_ray.self.light_object = OBJECT_NONE;
+  volume_ray.self.light_prim = PRIM_NONE;
 
   int stack_index = 0, enclosed_index = 0;
 
@@ -203,7 +210,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
     }
 
     /* Move ray forward. */
-    volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+    volume_ray.P = stack_sd->P;
     ++step;
   }
 #endif
diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h
index 97ca430752c..0a82c9cadef 100644
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -37,8 +37,9 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
 
   /* Advance ray beyond light. */
   /* TODO: can we make this more numerically robust to avoid reintersecting the
-   * same light in some cases? */
-  const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D);
+   * same light in some cases? Ray should not intersect surface anymore as the
+   * object and prim ids will prevent self intersection. */
+  const float3 new_ray_P = ray_P + ray_D * isect.t;
   INTEGRATOR_STATE_WRITE(state, ray, P) = new_ray_P;
   INTEGRATOR_STATE_WRITE(state, ray, t) -= isect.t;
 
@@ -46,7 +47,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
   const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
   ray_P -= ray_D * mis_ray_t;
   isect.t += mis_ray_t;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = mis_ray_t + isect.t;
+  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = isect.t;
 
   LightSample ls ccl_optional_struct_init;
   const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h
index a68fcaa7a64..3e8eba29ef7 100644
--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -83,7 +83,10 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
   /* Setup shader data. */
   Ray ray ccl_optional_struct_init;
   integrator_state_read_shadow_ray(kg, state, &ray);
-
+  ray.self.object = OBJECT_NONE;
+  ray.self.prim = PRIM_NONE;
+  ray.self.light_object = OBJECT_NONE;
+  ray.self.light_prim = PRIM_NONE;
   /* Modify ray position and length to match current segment. */
   const float start_t = (hit == 0) ? 0.0f :
                                      INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t);
@@ -149,7 +152,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
     const float last_hit_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, num_recorded_hits - 1, t);
     const float3 ray_P = INTEGRATOR_STATE(state, shadow_ray, P);
     const float3 ray_D = INTEGRATOR_STATE(state, shadow_ray, D);
-    INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D);
+    INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray_P + last_hit_t * ray_D;
     INTEGRATOR_STATE_WRITE(state, shadow_ray, t) -= last_hit_t;
   }
 
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 3d5b65458c7..10d3cbf7f57 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -182,23 +182,35 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+  // Save memory by storing the light and object indices in the shadow_isect
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 0, object) = ray.self.object;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 0, prim) = ray.self.prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 1, object) = ray.self.light_object;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 1, prim) = ray.self.light_prim;
 
   /* Copy state from main path to shadow path. */
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
   const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  shadow_flag |= PATH_RAY_SURFACE_PASS;
   const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const packed_float3 pass_diffuse_weight =
-        (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) :
-                        INTEGRATOR_STATE(state, path, pass_diffuse_weight);
-    const packed_float3 pass_glossy_weight = (bounce == 0) ?
-                                                 packed_float3(
-                                                     bsdf_eval_pass_glossy_weight(&bsdf_eval)) :
-                                                 INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    packed_float3 pass_diffuse_weight;
+    packed_float3 pass_glossy_weight;
+
+    if (shadow_flag & PATH_RAY_ANY_PASS) {
+      /* Indirect bounce, use weights from earlier surface or volume bounce. */
+      pass_diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+      pass_glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    }
+    else {
+      /* Direct light, use BSDFs at this bounce. */
+      shadow_flag |= PATH_RAY_SURFACE_PASS;
+      pass_diffuse_weight = packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval));
+      pass_glossy_weight = packed_float3(bsdf_eval_pass_glossy_weight(&bsdf_eval));
+    }
+
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
   }
@@ -266,13 +278,11 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   }
 
   /* Setup ray. Note that clipping works through transparent bounces. */
-  INTEGRATOR_STATE_WRITE(state, ray, P) = ray_offset(sd->P,
-                                                     (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
+  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
   INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
   INTEGRATOR_STATE_WRITE(state, ray, t) = (label & LABEL_TRANSPARENT) ?
                                               INTEGRATOR_STATE(state, ray, t) - sd->ray_length :
                                               FLT_MAX;
-
 #ifdef __RAY_DIFFERENTIALS__
   INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
   INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in);
@@ -316,7 +326,7 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
   }
 
   /* Setup ray position, direction stays unchanged. */
-  INTEGRATOR_STATE_WRITE(state, ray, P) = ray_offset(sd->P, -sd->Ng);
+  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
 
   /* Clipping works through transparent. */
   INTEGRATOR_STATE_WRITE(state, ray, t) -= sd->ray_length;
@@ -360,10 +370,14 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
   }
 
   Ray ray ccl_optional_struct_init;
-  ray.P = ray_offset(sd->P, sd->Ng);
+  ray.P = sd->P;
   ray.D = ao_D;
   ray.t = kernel_data.integrator.ao_bounces_distance;
   ray.time = sd->time;
+  ray.self.object = sd->object;
+  ray.self.prim = sd->prim;
+  ray.self.light_object = OBJECT_NONE;
+  ray.self.light_prim = PRIM_NONE;
   ray.dP = differential_zero_compact();
   ray.dD = differential_zero_compact();
 
@@ -375,6 +389,10 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
 
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 0, object) = ray.self.object;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 0, prim) = ray.self.prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 1, object) = ray.self.light_object;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 1, prim) = ray.self.light_prim;
 
   /* Copy state from main path to shadow path. */
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index 712c22357b8..c59234553a7 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -791,22 +791,36 @@ ccl_device_forceinline void integrate_volume_direct_light(
 
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 0, object) = ray.self.object;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 0, prim) = ray.self.prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 1, object) = ray.self.light_object;
+  INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_isect, 1, prim) = ray.self.light_prim;
 
   /* Copy state from main path to shadow path. */
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
   const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  shadow_flag |= PATH_RAY_VOLUME_PASS;
   const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const packed_float3 pass_diffuse_weight = (bounce == 0) ?
-                                                  packed_float3(one_float3()) :
-                                                  INTEGRATOR_STATE(
-                                                      state, path, pass_diffuse_weight);
+    packed_float3 pass_diffuse_weight;
+    packed_float3 pass_glossy_weight;
+
+    if (shadow_flag & PATH_RAY_ANY_PASS) {
+      /* Indirect bounce, use weights from earlier surface or volume bounce. */
+      pass_diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+      pass_glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    }
+    else {
+      /* Direct light, no diffuse/glossy distinction needed for volumes. */
+      shadow_flag |= PATH_RAY_VOLUME_PASS;
+      pass_diffuse_weight = packed_float3(one_float3());
+      pass_glossy_weight = packed_float3(zero_float3());
+    }
+
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
   }
 
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
@@ -873,11 +887,13 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
   INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
   INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(phase_omega_in);
   INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX;
-
 #  ifdef __RAY_DIFFERENTIALS__
   INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
   INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(phase_domega_in);
 #  endif
+  // Save memory by storing last hit prim and object in isect
+  INTEGRATOR_STATE_WRITE(state, isect, prim) = sd->prim;
+  INTEGRATOR_STATE_WRITE(state, isect, object) = sd->object;
 
   /* Update throughput. */
   const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h
index 625a429d3db..86fcabdcd82 100644
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -61,6 +61,7 @@ KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, int, object, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_END(shadow_ray)
 
 /*********************** Shadow Intersection result **************************/
diff --git a/intern/cycles/kernel/integrator/subsurface.h b/intern/cycles/kernel/integrator/subsurface.h
index 59b0cd2596c..6c0f815afea 100644
--- a/intern/cycles/kernel/integrator/subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -57,7 +57,6 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
 
   /* Pass along object info, reusing isect to save memory. */
   INTEGRATOR_STATE_WRITE(state, subsurface, Ng) = sd->Ng;
-  INTEGRATOR_STATE_WRITE(state, isect, object) = sd->object;
 
   uint32_t path_flag = (INTEGRATOR_STATE(state, path, flag) & ~PATH_RAY_CAMERA) |
                        ((sc->type == CLOSURE_BSSRDF_BURLEY_ID) ? PATH_RAY_SUBSURFACE_DISK :
@@ -165,10 +164,8 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
 
     if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) {
       float3 P = INTEGRATOR_STATE(state, ray, P);
-      const float3 Ng = INTEGRATOR_STATE(state, subsurface, Ng);
-      const float3 offset_P = ray_offset(P, -Ng);
 
-      integrator_volume_stack_update_for_subsurface(kg, state, offset_P, ray.P);
+      integrator_volume_stack_update_for_subsurface(kg, state, P, ray.P);
     }
   }
 #  endif /* __VOLUME__ */
diff --git a/intern/cycles/kernel/integrator/subsurface_disk.h b/intern/cycles/kernel/integrator/subsurface_disk.h
index cc6f5048cda..f5641d1fa5e 100644
--- a/intern/cycles/kernel/integrator/subsurface_disk.h
+++ b/intern/cycles/kernel/integrator/subsurface_disk.h
@@ -99,6 +99,10 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   ray.dP = ray_dP;
   ray.dD = differential_zero_compact();
   ray.time = time;
+  ray.self.object = OBJECT_NONE;
+  ray.self.prim = PRIM_NONE;
+  ray.self.light_object = OBJECT_NONE;
+  ray.self.light_prim = OBJECT_NONE;
 
   /* Intersect with the same object. if multiple intersections are found it
    * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits. */
diff --git a/intern/cycles/kernel/integrator/subsurface_random_walk.h b/intern/cycles/kernel/integrator/subsurface_random_walk.h
index 7a8b467e199..993c54d9050 100644
--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -195,6 +195,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   const float time = INTEGRATOR_STATE(state, ray, time);
   const float3 Ng = INTEGRATOR_STATE(state, subsurface, Ng);
   const int object = INTEGRATOR_STATE(state, isect, object);
+  const int prim = INTEGRATOR_STATE(state, isect, prim);
 
   /* Sample diffuse surface scatter into the object. */
   float3 D;
@@ -205,12 +206,16 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   }
 
   /* Setup ray. */
-  ray.P = ray_offset(P, -Ng);
+  ray.P = P;
   ray.D = D;
   ray.t = FLT_MAX;
   ray.time = time;
   ray.dP = ray_dP;
   ray.dD = differential_zero_compact();
+  ray.self.object = object;
+  ray.self.prim = prim;
+  ray.self.light_object = OBJECT_NONE;
+  ray.self.light_prim = PRIM_NONE;
 
 #ifndef __KERNEL_GPU_RAYTRACING__
   /* Compute or fetch object transforms. */
@@ -377,7 +382,15 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
      * If yes, we will later use backwards guided sampling in order to have a decent
      * chance of connecting to it.
      * TODO: Maybe use less than 10 times the mean free path? */
-    ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
+    if (bounce == 0) {
+      ray.t = max(t, 10.0f / (min3(sigma_t)));
+    }
+    else {
+      ray.t = t;
+      /* After the first bounce the object can intersect the same surface again */
+      ray.self.object = OBJECT_NONE;
+      ray.self.prim = PRIM_NONE;
+    }
     scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1);
     hit = (ss_isect.num_hits > 0);
 
@@ -408,13 +421,6 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     if (hit) {
       t = ray.t;
     }
-    else if (bounce == 0) {
-      /* Restore original position if nothing was hit after the first bounce,
-       * without the ray_offset() that was added to avoid self-intersection.
-       * Otherwise if that offset is relatively large compared to the scattering
-       * radius, we never go back up high enough to exit the surface. */
-      ray.P = P;
-    }
 
     /* Advance to new scatter location. */
     ray.P += t * ray.D;
diff --git a/intern/cycles/kernel/light/light.h b/intern/cycles/kernel/light/light.h
index 6e445f862db..d05fe47cc2c 100644
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -113,22 +113,30 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
     ls->P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 
     if (type == LIGHT_SPOT) {
-      ls->Ng = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
-      float radius = klight->spot.radius;
+      const float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+      const float radius = klight->spot.radius;
+      const float3 dir = make_float3(
+          klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
+      /* disk oriented normal */
+      const float3 lightN = normalize(P - center);
+      ls->P = center;
 
       if (radius > 0.0f)
-        /* sphere light */
-        ls->P += disk_light_sample(ls->Ng, randu, randv) * radius;
+        /* disk light */
+        ls->P += disk_light_sample(lightN, randu, randv) * radius;
+
+      const float invarea = klight->spot.invarea;
+      ls->pdf = invarea;
 
       ls->D = normalize_len(ls->P - P, &ls->t);
+      /* we set the light normal to the outgoing direction to support texturing */
+      ls->Ng = -ls->D;
 
-      float invarea = klight->spot.invarea;
       ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
-      ls->pdf = invarea;
 
       /* spot light attenuation */
       ls->eval_fac *= spot_light_attenuation(
-          ls->Ng, klight->spot.spot_angle, klight->spot.spot_smooth, -ls->D);
+          dir, klight->spot.spot_angle, klight->spot.spot_smooth, -ls->D);
       if (!in_volume_segment && ls->eval_fac == 0.0f) {
         return false;
       }
@@ -137,32 +145,33 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
       ls->u = uv.x;
       ls->v = uv.y;
 
-      ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+      ls->pdf *= lamp_light_pdf(kg, lightN, -ls->D, ls->t);
     }
     else if (type == LIGHT_POINT) {
       float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
       float radius = klight->spot.radius;
+      /* disk oriented normal */
+      const float3 lightN = normalize(P - center);
       ls->P = center;
-      float pdf = 1.0;
 
       if (radius > 0.0f) {
-        ls->Ng = normalize(P - center);
-        ls->P += disk_light_sample(ls->Ng, randu, randv) * radius;
-        pdf = klight->spot.invarea;
-        ls->D = normalize_len(ls->P - P, &ls->t);
-      }
-      else {
-        ls->Ng = normalize(P - center);
+        ls->P += disk_light_sample(lightN, randu, randv) * radius;
       }
+      ls->pdf = klight->spot.invarea;
 
       ls->D = normalize_len(ls->P - P, &ls->t);
-      ls->pdf = pdf;
+      /* we set the light normal to the outgoing direction to support texturing */
+      ls->Ng = -ls->D;
+
       ls->eval_fac = M_1_PI_F * 0.25f * klight->spot.invarea;
+      if (!in_volume_segment && ls->eval_fac == 0.0f) {
+        return false;
+      }
 
       float2 uv = map_to_sphere(ls->Ng);
       ls->u = uv.x;
       ls->v = uv.y;
-      ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+      ls->pdf *= lamp_light_pdf(kg, lightN, -ls->D, ls->t);
     }
     else {
       /* area light */
@@ -263,14 +272,16 @@ ccl_device bool lights_intersect(KernelGlobals kg,
 
     if (type == LIGHT_SPOT) {
       /* Spot/Disk light. */
+      const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
+      const float3 ray_P = ray->P - ray->D * mis_ray_t;
+
       const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-      const float3 lightN = make_float3(
-          klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
       const float radius = klight->spot.radius;
       if (radius == 0.0f) {
         continue;
       }
-
+      /* disk oriented normal */
+      const float3 lightN = normalize(ray_P - lightP);
       /* One sided. */
       if (dot(ray->D, lightN) >= 0.0f) {
         continue;
@@ -292,9 +303,10 @@ ccl_device bool lights_intersect(KernelGlobals kg,
         continue;
       }
 
+      /* disk oriented normal */
+      const float3 lightN = normalize(ray_P - lightP);
       float3 P;
-      const float3 lsN = normalize(ray_P - lightP);
-      if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lsN, radius, &P, &t)) {
+      if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) {
         continue;
       }
     }
@@ -418,8 +430,8 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
   LightType type = (LightType)klight->type;
   ls->type = type;
   ls->shader = klight->shader_id;
-  ls->object = PRIM_NONE;
-  ls->prim = PRIM_NONE;
+  ls->object = isect->object;
+  ls->prim = isect->prim;
   ls->lamp = lamp;
   /* todo: missing texture coordinates */
   ls->t = isect->t;
@@ -427,7 +439,12 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
   ls->D = ray_D;
 
   if (type == LIGHT_SPOT) {
-    ls->Ng = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
+    const float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+    const float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
+    /* the normal of the oriented disk */
+    const float3 lightN = normalize(ray_P - center);
+    /* we set the light normal to the outgoing direction to support texturing*/
+    ls->Ng = -ls->D;
 
     float invarea = klight->spot.invarea;
     ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
@@ -435,7 +452,7 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
 
     /* spot light attenuation */
     ls->eval_fac *= spot_light_attenuation(
-        ls->Ng, klight->spot.spot_angle, klight->spot.spot_smooth, -ls->D);
+        dir, klight->spot.spot_angle, klight->spot.spot_smooth, -ls->D);
 
     if (ls->eval_fac == 0.0f) {
       return false;
@@ -447,23 +464,32 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
 
     /* compute pdf */
     if (ls->t != FLT_MAX)
-      ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+      ls->pdf *= lamp_light_pdf(kg, lightN, -ls->D, ls->t);
+    else
+      ls->pdf = 0.f;
   }
   else if (type == LIGHT_POINT) {
-    float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+    const float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+    const float3 lighN = normalize(ray_P - center);
+
+    /* we set the light normal to the outgoing direction to support texturing*/
+    ls->Ng = -ls->D;
 
-    ls->Ng = normalize(ray_P - center);
     float invarea = klight->spot.invarea;
     ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
     ls->pdf = invarea;
 
+    if (ls->eval_fac == 0.0f) {
+      return false;
+    }
+
     float2 uv = map_to_sphere(ls->Ng);
     ls->u = uv.x;
     ls->v = uv.y;
 
     /* compute pdf */
     if (ls->t != FLT_MAX)
-      ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+      ls->pdf *= lamp_light_pdf(kg, lighN, -ls->D, ls->t);
     else
       ls->pdf = 0.f;
   }
@@ -921,4 +947,4 @@ ccl_device_inline bool light_distribution_sample_new_position(KernelGlobals kg,
   }
 }
 
-CCL_NAMESPACE_END
+CCL_NAMESPACE_END
+\ No newline at end of file
diff --git a/intern/cycles/kernel/light/sample.h b/intern/cycles/kernel/light/sample.h
index 7dbc783b1bb..521ad2f7066 100644
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -198,7 +198,7 @@ ccl_device_inline float3 shadow_ray_offset(KernelGlobals kg,
   float NL = dot(sd->N, L);
   bool transmit = (NL < 0.0f);
   float3 Ng = (transmit ? -sd->Ng : sd->Ng);
-  float3 P = ray_offset(sd->P, Ng);
+  float3 P = sd->P;
 
   if ((sd->type & PRIMITIVE_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
     const float offset_cutoff =
@@ -243,7 +243,7 @@ ccl_device_inline void shadow_ray_setup(ccl_private const ShaderData *ccl_restri
     }
     else {
       /* other lights, avoid self-intersection */
-      ray->D = ray_offset(ls->P, ls->Ng) - P;
+      ray->D = ls->P - P;
       ray->D = normalize_len(ray->D, &ray->t);
     }
   }
@@ -257,6 +257,12 @@ ccl_device_inline void shadow_ray_setup(ccl_private const ShaderData *ccl_restri
   ray->dP = differential_make_compact(sd->dP);
   ray->dD = differential_zero_compact();
   ray->time = sd->time;
+
+  /* Fill in intersection surface and light details. */
+  ray->self.prim = sd->prim;
+  ray->self.object = sd->object;
+  ray->self.light_prim = ls->prim;
+  ray->self.light_object = ls->object;
 }
 
 /* Create shadow ray towards light sample. */
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index a79fc323a13..d79e7dfa8a5 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -116,6 +116,8 @@ ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 ustring OSLRenderServices::u_curve_random("geom:curve_random");
 ustring OSLRenderServices::u_is_point("geom:is_point");
 ustring OSLRenderServices::u_point_radius("geom:point_radius");
+ustring OSLRenderServices::u_point_position("geom:point_position");
+ustring OSLRenderServices::u_point_random("geom:point_random");
 ustring OSLRenderServices::u_normal_map_normal("geom:normal_map_normal");
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
 ustring OSLRenderServices::u_path_ray_depth("path:ray_depth");
@@ -999,6 +1001,10 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     float3 f = curve_tangent_normal(kg, sd);
     return set_attribute_float3(f, type, derivatives, val);
   }
+  else if (name == u_curve_random) {
+    float f = curve_random(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
   /* point attributes */
   else if (name == u_is_point) {
     float f = (sd->type & PRIMITIVE_POINT) != 0;
@@ -1008,6 +1014,14 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     float f = point_radius(kg, sd);
     return set_attribute_float(f, type, derivatives, val);
   }
+  else if (name == u_point_position) {
+    float3 f = point_position(kg, sd);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == u_point_random) {
+    float f = point_random(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
   else if (name == u_normal_map_normal) {
     if (sd->type & PRIMITIVE_TRIANGLE) {
       float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index 9526c92b8fb..96c71297186 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -298,7 +298,9 @@ class OSLRenderServices : public OSL::RendererServices {
   static ustring u_curve_tangent_normal;
   static ustring u_curve_random;
   static ustring u_is_point;
+  static ustring u_point_position;
   static ustring u_point_radius;
+  static ustring u_point_random;
   static ustring u_normal_map_normal;
   static ustring u_path_ray_length;
   static ustring u_path_ray_depth;
diff --git a/intern/cycles/kernel/osl/shaders/CMakeLists.txt b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
index 4cafdb2a6d7..16a9b1cc012 100644
--- a/intern/cycles/kernel/osl/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
@@ -49,6 +49,7 @@ set(SRC_OSL
   node_glossy_bsdf.osl
   node_gradient_texture.osl
   node_hair_info.osl
+  node_point_info.osl
   node_scatter_volume.osl
   node_absorption_volume.osl
   node_principled_volume.osl
diff --git a/intern/cycles/kernel/osl/shaders/node_point_info.osl b/intern/cycles/kernel/osl/shaders/node_point_info.osl
new file mode 100644
index 00000000000..58d8acbf269
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_point_info.osl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2022 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdcycles.h"
+
+shader node_point_info(output point Position = point(0.0, 0.0, 0.0),
+                       output float Radius = 0.0,
+                       output float Random = 0.0)
+{
+  getattribute("geom:point_position", Position);
+  getattribute("geom:point_radius", Radius);
+  getattribute("geom:point_random", Random);
+}
diff --git a/intern/cycles/kernel/svm/ao.h b/intern/cycles/kernel/svm/ao.h
index 678f49c8ccd..dcb1a79717d 100644
--- a/intern/cycles/kernel/svm/ao.h
+++ b/intern/cycles/kernel/svm/ao.h
@@ -70,10 +70,14 @@ ccl_device float svm_ao(
 
     /* Create ray. */
     Ray ray;
-    ray.P = ray_offset(sd->P, N);
+    ray.P = sd->P;
     ray.D = D.x * T + D.y * B + D.z * N;
     ray.t = max_dist;
     ray.time = sd->time;
+    ray.self.object = sd->object;
+    ray.self.prim = sd->prim;
+    ray.self.light_object = OBJECT_NONE;
+    ray.self.light_prim = PRIM_NONE;
     ray.dP = differential_zero_compact();
     ray.dD = differential_zero_compact();
 
diff --git a/intern/cycles/kernel/svm/attribute.h b/intern/cycles/kernel/svm/attribute.h
index e9de0164c7a..17301028528 100644
--- a/intern/cycles/kernel/svm/attribute.h
+++ b/intern/cycles/kernel/svm/attribute.h
@@ -87,7 +87,9 @@ ccl_device_noinline void svm_node_attr(KernelGlobals kg,
   if (node.y == ATTR_STD_GENERATED && desc.element == ATTR_ELEMENT_NONE) {
     /* No generated attribute, fall back to object coordinates. */
     float3 f = sd->P;
-    object_inverse_position_transform(kg, sd, &f);
+    if (sd->object != OBJECT_NONE) {
+      object_inverse_position_transform(kg, sd, &f);
+    }
     if (type == NODE_ATTR_OUTPUT_FLOAT) {
       stack_store_float(stack, out_offset, average(f));
     }
@@ -179,7 +181,9 @@ ccl_device_noinline void svm_node_attr_bump_dx(KernelGlobals kg,
   if (node.y == ATTR_STD_GENERATED && desc.element == ATTR_ELEMENT_NONE) {
     /* No generated attribute, fall back to object coordinates. */
     float3 f = sd->P + sd->dP.dx;
-    object_inverse_position_transform(kg, sd, &f);
+    if (sd->object != OBJECT_NONE) {
+      object_inverse_position_transform(kg, sd, &f);
+    }
     if (type == NODE_ATTR_OUTPUT_FLOAT) {
       stack_store_float(stack, out_offset, average(f));
     }
@@ -275,7 +279,9 @@ ccl_device_noinline void svm_node_attr_bump_dy(KernelGlobals kg,
   if (node.y == ATTR_STD_GENERATED && desc.element == ATTR_ELEMENT_NONE) {
     /* No generated attribute, fall back to object coordinates. */
     float3 f = sd->P + sd->dP.dy;
-    object_inverse_position_transform(kg, sd, &f);
+    if (sd->object != OBJECT_NONE) {
+      object_inverse_position_transform(kg, sd, &f);
+    }
     if (type == NODE_ATTR_OUTPUT_FLOAT) {
       stack_store_float(stack, out_offset, average(f));
     }
diff --git a/intern/cycles/kernel/svm/bevel.h b/intern/cycles/kernel/svm/bevel.h
index 46dfb6631da..98b663299da 100644
--- a/intern/cycles/kernel/svm/bevel.h
+++ b/intern/cycles/kernel/svm/bevel.h
@@ -196,6 +196,10 @@ ccl_device float3 svm_bevel(
     ray.dP = differential_zero_compact();
     ray.dD = differential_zero_compact();
     ray.time = sd->time;
+    ray.self.object = OBJECT_NONE;
+    ray.self.prim = PRIM_NONE;
+    ray.self.light_object = OBJECT_NONE;
+    ray.self.light_prim = PRIM_NONE;
 
     /* Intersect with the same object. if multiple intersections are found it
      * will use at most LOCAL_MAX_HITS hits, a random subset of all hits. */
@@ -207,15 +211,24 @@ ccl_device float3 svm_bevel(
       /* Quickly retrieve P and Ng without setting up ShaderData. */
       float3 hit_P;
       if (sd->type == PRIMITIVE_TRIANGLE) {
-        hit_P = triangle_refine_local(
-            kg, sd, ray.P, ray.D, ray.t, isect.hits[hit].object, isect.hits[hit].prim);
+        hit_P = triangle_point_from_uv(kg,
+                                       sd,
+                                       isect.hits[hit].object,
+                                       isect.hits[hit].prim,
+                                       isect.hits[hit].u,
+                                       isect.hits[hit].v);
       }
 #  ifdef __OBJECT_MOTION__
       else if (sd->type == PRIMITIVE_MOTION_TRIANGLE) {
         float3 verts[3];
         motion_triangle_vertices(kg, sd->object, isect.hits[hit].prim, sd->time, verts);
-        hit_P = motion_triangle_refine_local(
-            kg, sd, ray.P, ray.D, ray.t, isect.hits[hit].object, isect.hits[hit].prim, verts);
+        hit_P = motion_triangle_point_from_uv(kg,
+                                              sd,
+                                              isect.hits[hit].object,
+                                              isect.hits[hit].prim,
+                                              isect.hits[hit].u,
+                                              isect.hits[hit].v,
+                                              verts);
       }
 #  endif /* __OBJECT_MOTION__ */
 
diff --git a/intern/cycles/kernel/svm/geometry.h b/intern/cycles/kernel/svm/geometry.h
index 2bac58b0aa2..225348b1ac2 100644
--- a/intern/cycles/kernel/svm/geometry.h
+++ b/intern/cycles/kernel/svm/geometry.h
@@ -242,13 +242,6 @@ ccl_device_noinline void svm_node_hair_info(KernelGlobals kg,
       stack_store_float(stack, out_offset, data);
       break;
     }
-#  if 0
-    case NODE_INFO_CURVE_FADE: {
-      data = sd->curve_transparency;
-      stack_store_float(stack, out_offset, data);
-      break;
-    }
-#  endif
     case NODE_INFO_CURVE_TANGENT_NORMAL: {
       data3 = curve_tangent_normal(kg, sd);
       stack_store_float3(stack, out_offset, data3);
@@ -258,4 +251,28 @@ ccl_device_noinline void svm_node_hair_info(KernelGlobals kg,
 }
 #endif
 
+#ifdef __POINTCLOUD__
+
+/* Point Info */
+
+ccl_device_noinline void svm_node_point_info(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             ccl_private float *stack,
+                                             uint type,
+                                             uint out_offset)
+{
+  switch (type) {
+    case NODE_INFO_POINT_POSITION:
+      stack_store_float3(stack, out_offset, point_position(kg, sd));
+      break;
+    case NODE_INFO_POINT_RADIUS:
+      stack_store_float(stack, out_offset, point_radius(kg, sd));
+      break;
+    case NODE_INFO_POINT_RANDOM:
+      break; /* handled as attribute */
+  }
+}
+
+#endif
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index b226bc66771..35d4c3f2055 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -454,13 +454,14 @@ ccl_device void svm_eval_nodes(KernelGlobals kg,
         break;
 #if defined(__HAIR__)
       case NODE_HAIR_INFO:
-        IF_KERNEL_NODES_FEATURE(HAIR)
-        {
-          svm_node_hair_info(kg, sd, stack, node.y, node.z);
-        }
+        svm_node_hair_info(kg, sd, stack, node.y, node.z);
+        break;
+#endif
+#if defined(__POINTCLOUD__)
+      case NODE_POINT_INFO:
+        svm_node_point_info(kg, sd, stack, node.y, node.z);
         break;
 #endif
-
       case NODE_TEXTURE_MAPPING:
         offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
         break;
diff --git a/intern/cycles/kernel/svm/types.h b/intern/cycles/kernel/svm/types.h
index dd1b1f9bc28..16e9fd8862a 100644
--- a/intern/cycles/kernel/svm/types.h
+++ b/intern/cycles/kernel/svm/types.h
@@ -81,6 +81,7 @@ typedef enum ShaderNodeType {
   NODE_OBJECT_INFO,
   NODE_PARTICLE_INFO,
   NODE_HAIR_INFO,
+  NODE_POINT_INFO,
   NODE_TEXTURE_MAPPING,
   NODE_MAPPING,
   NODE_MIN_MAX,
@@ -176,12 +177,16 @@ typedef enum NodeHairInfo {
   NODE_INFO_CURVE_INTERCEPT,
   NODE_INFO_CURVE_LENGTH,
   NODE_INFO_CURVE_THICKNESS,
-  /* Fade for minimum hair width transiency. */
-  // NODE_INFO_CURVE_FADE,
   NODE_INFO_CURVE_TANGENT_NORMAL,
   NODE_INFO_CURVE_RANDOM,
 } NodeHairInfo;
 
+typedef enum NodePointInfo {
+  NODE_INFO_POINT_POSITION,
+  NODE_INFO_POINT_RADIUS,
+  NODE_INFO_POINT_RANDOM,
+} NodePointInfo;
+
 typedef enum NodeLightPath {
   NODE_LP_camera = 0,
   NODE_LP_shadow,
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 5d41abb53c4..d4cb22d4af4 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -512,12 +512,21 @@ typedef struct differential {
 
 /* Ray */
 
+typedef struct RaySelfPrimitives {
+  int prim;         /* Primitive the ray is starting from */
+  int object;       /* Instance prim is a part of */
+  int light_prim;   /* Light primitive */
+  int light_object; /* Light object */
+} RaySelfPrimitives;
+
 typedef struct Ray {
   float3 P;   /* origin */
   float3 D;   /* direction */
   float t;    /* length of the ray */
   float time; /* time (for motion blur) */
 
+  RaySelfPrimitives self;
+
 #ifdef __RAY_DIFFERENTIALS__
   float dP;
   float dD;
@@ -1565,21 +1574,21 @@ enum KernelFeatureFlag : uint32_t {
   KERNEL_FEATURE_NODE_BSDF = (1U << 0U),
   KERNEL_FEATURE_NODE_EMISSION = (1U << 1U),
   KERNEL_FEATURE_NODE_VOLUME = (1U << 2U),
-  KERNEL_FEATURE_NODE_HAIR = (1U << 3U),
-  KERNEL_FEATURE_NODE_BUMP = (1U << 4U),
-  KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U),
-  KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U),
-  KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U),
-  KERNEL_FEATURE_NODE_AOV = (1U << 8U),
-  KERNEL_FEATURE_NODE_LIGHT_PATH = (1U << 9U),
+  KERNEL_FEATURE_NODE_BUMP = (1U << 3U),
+  KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 4U),
+  KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 5U),
+  KERNEL_FEATURE_NODE_RAYTRACE = (1U << 6U),
+  KERNEL_FEATURE_NODE_AOV = (1U << 7U),
+  KERNEL_FEATURE_NODE_LIGHT_PATH = (1U << 8U),
 
   /* Use denoising kernels and output denoising passes. */
-  KERNEL_FEATURE_DENOISING = (1U << 10U),
+  KERNEL_FEATURE_DENOISING = (1U << 9U),
 
   /* Use path tracing kernels. */
-  KERNEL_FEATURE_PATH_TRACING = (1U << 11U),
+  KERNEL_FEATURE_PATH_TRACING = (1U << 10U),
 
   /* BVH/sampling kernel features. */
+  KERNEL_FEATURE_POINTCLOUD = (1U << 11U),
   KERNEL_FEATURE_HAIR = (1U << 12U),
   KERNEL_FEATURE_HAIR_THICK = (1U << 13U),
   KERNEL_FEATURE_OBJECT_MOTION = (1U << 14U),
@@ -1616,9 +1625,6 @@ enum KernelFeatureFlag : uint32_t {
   KERNEL_FEATURE_AO_PASS = (1U << 25U),
   KERNEL_FEATURE_AO_ADDITIVE = (1U << 26U),
   KERNEL_FEATURE_AO = (KERNEL_FEATURE_AO_PASS | KERNEL_FEATURE_AO_ADDITIVE),
-
-  /* Point clouds. */
-  KERNEL_FEATURE_POINTCLOUD = (1U << 27U),
 };
 
 /* Shader node feature mask, to specialize shader evaluation for kernels. */
@@ -1628,7 +1634,7 @@ enum KernelFeatureFlag : uint32_t {
    KERNEL_FEATURE_NODE_LIGHT_PATH)
 #define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \
   (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \
-   KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
+   KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
    KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_LIGHT_PATH)
 #define KERNEL_FEATURE_NODE_MASK_SURFACE \
   (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE | \
diff --git a/intern/cycles/scene/colorspace.cpp b/intern/cycles/scene/colorspace.cpp
index c1a308fcbaa..f0b7eb724de 100644
--- a/intern/cycles/scene/colorspace.cpp
+++ b/intern/cycles/scene/colorspace.cpp
@@ -263,7 +263,9 @@ template<typename T> inline void cast_from_float4(T *data, float4 value)
 
 /* Slower versions for other all data types, which needs to convert to float and back. */
 template<typename T, bool compress_as_srgb = false>
-inline void processor_apply_pixels(const OCIO::Processor *processor, T *pixels, size_t num_pixels)
+inline void processor_apply_pixels_rgba(const OCIO::Processor *processor,
+                                        T *pixels,
+                                        size_t num_pixels)
 {
   /* TODO: implement faster version for when we know the conversion
    * is a simple matrix transform between linear spaces. In that case
@@ -310,25 +312,79 @@ inline void processor_apply_pixels(const OCIO::Processor *processor, T *pixels,
     }
   }
 }
+
+template<typename T, bool compress_as_srgb = false>
+inline void processor_apply_pixels_grayscale(const OCIO::Processor *processor,
+                                             T *pixels,
+                                             size_t num_pixels)
+{
+  OCIO::ConstCPUProcessorRcPtr device_processor = processor->getDefaultCPUProcessor();
+
+  /* Process large images in chunks to keep temporary memory requirement down. */
+  const size_t chunk_size = std::min((size_t)(16 * 1024 * 1024), num_pixels);
+  vector<float> float_pixels(chunk_size * 3);
+
+  for (size_t j = 0; j < num_pixels; j += chunk_size) {
+    size_t width = std::min(chunk_size, num_pixels - j);
+
+    /* Convert to 3 channels, since that's the minimum required by OpenColorIO. */
+    {
+      const T *pixel = pixels + j;
+      float *fpixel = float_pixels.data();
+      for (size_t i = 0; i < width; i++, pixel++, fpixel += 3) {
+        const float f = util_image_cast_to_float<T>(*pixel);
+        fpixel[0] = f;
+        fpixel[1] = f;
+        fpixel[2] = f;
+      }
+    }
+
+    OCIO::PackedImageDesc desc((float *)float_pixels.data(), width, 1, 3);
+    device_processor->apply(desc);
+
+    {
+      T *pixel = pixels + j;
+      const float *fpixel = float_pixels.data();
+      for (size_t i = 0; i < width; i++, pixel++, fpixel += 3) {
+        float f = average(make_float3(fpixel[0], fpixel[1], fpixel[2]));
+        if (compress_as_srgb) {
+          f = color_linear_to_srgb(f);
+        }
+        *pixel = util_image_cast_from_float<T>(f);
+      }
+    }
+  }
+}
+
 #endif
 
 template<typename T>
-void ColorSpaceManager::to_scene_linear(ustring colorspace,
-                                        T *pixels,
-                                        size_t num_pixels,
-                                        bool compress_as_srgb)
+void ColorSpaceManager::to_scene_linear(
+    ustring colorspace, T *pixels, size_t num_pixels, bool is_rgba, bool compress_as_srgb)
 {
 #ifdef WITH_OCIO
   const OCIO::Processor *processor = (const OCIO::Processor *)get_processor(colorspace);
 
   if (processor) {
-    if (compress_as_srgb) {
-      /* Compress output as sRGB. */
-      processor_apply_pixels<T, true>(processor, pixels, num_pixels);
+    if (is_rgba) {
+      if (compress_as_srgb) {
+        /* Compress output as sRGB. */
+        processor_apply_pixels_rgba<T, true>(processor, pixels, num_pixels);
+      }
+      else {
+        /* Write output as scene linear directly. */
+        processor_apply_pixels_rgba<T>(processor, pixels, num_pixels);
+      }
     }
     else {
-      /* Write output as scene linear directly. */
-      processor_apply_pixels<T>(processor, pixels, num_pixels);
+      if (compress_as_srgb) {
+        /* Compress output as sRGB. */
+        processor_apply_pixels_grayscale<T, true>(processor, pixels, num_pixels);
+      }
+      else {
+        /* Write output as scene linear directly. */
+        processor_apply_pixels_grayscale<T>(processor, pixels, num_pixels);
+      }
     }
   }
 #else
@@ -348,6 +404,11 @@ void ColorSpaceManager::to_scene_linear(ColorSpaceProcessor *processor_,
 
   if (processor) {
     OCIO::ConstCPUProcessorRcPtr device_processor = processor->getDefaultCPUProcessor();
+    if (channels == 1) {
+      float3 rgb = make_float3(pixel[0], pixel[0], pixel[0]);
+      device_processor->applyRGB(&rgb.x);
+      pixel[0] = average(rgb);
+    }
     if (channels == 3) {
       device_processor->applyRGB(pixel);
     }
@@ -390,9 +451,9 @@ void ColorSpaceManager::free_memory()
 }
 
 /* Template instantiations so we don't have to inline functions. */
-template void ColorSpaceManager::to_scene_linear(ustring, uchar *, size_t, bool);
-template void ColorSpaceManager::to_scene_linear(ustring, ushort *, size_t, bool);
-template void ColorSpaceManager::to_scene_linear(ustring, half *, size_t, bool);
-template void ColorSpaceManager::to_scene_linear(ustring, float *, size_t, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, uchar *, size_t, bool, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, ushort *, size_t, bool, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, half *, size_t, bool, bool);
+template void ColorSpaceManager::to_scene_linear(ustring, float *, size_t, bool, bool);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/scene/colorspace.h b/intern/cycles/scene/colorspace.h
index 7f7bc604f07..f02c1231a44 100644
--- a/intern/cycles/scene/colorspace.h
+++ b/intern/cycles/scene/colorspace.h
@@ -43,10 +43,8 @@ class ColorSpaceManager {
   /* Convert pixels in the specified colorspace to scene linear color for
    * rendering. Must be a colorspace returned from detect_known_colorspace. */
   template<typename T>
-  static void to_scene_linear(ustring colorspace,
-                              T *pixels,
-                              size_t num_pixels,
-                              bool compress_as_srgb);
+  static void to_scene_linear(
+      ustring colorspace, T *pixels, size_t num_pixels, bool is_rgba, bool compress_as_srgb);
 
   /* Efficiently convert pixels to scene linear colorspace at render time,
    * for OSL where the image texture cache contains original pixels. The
diff --git a/intern/cycles/scene/constant_fold.cpp b/intern/cycles/scene/constant_fold.cpp
index a5fb68bf229..e9fb3426b70 100644
--- a/intern/cycles/scene/constant_fold.cpp
+++ b/intern/cycles/scene/constant_fold.cpp
@@ -441,9 +441,13 @@ void ConstantFolder::fold_mapping(NodeMappingType type) const
   if (is_zero(scale_in)) {
     make_zero();
   }
-  else if ((is_zero(location_in) || type == NODE_MAPPING_TYPE_VECTOR ||
-            type == NODE_MAPPING_TYPE_NORMAL) &&
-           is_zero(rotation_in) && is_one(scale_in)) {
+  else if (
+      /* Can't constant fold since we always need to normalize the output. */
+      (type != NODE_MAPPING_TYPE_NORMAL) &&
+      /* Check all use values are zero, note location is not used by vector and normal types. */
+      (is_zero(location_in) || type == NODE_MAPPING_TYPE_VECTOR ||
+       type == NODE_MAPPING_TYPE_NORMAL) &&
+      is_zero(rotation_in) && is_one(scale_in)) {
     try_bypass_or_make_constant(vector_in);
   }
 }
diff --git a/intern/cycles/scene/geometry.cpp b/intern/cycles/scene/geometry.cpp
index 49d18d00dd7..90f1e1cb021 100644
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -236,6 +236,7 @@ void Geometry::compute_bvh(
 
       BVHParams bparams;
       bparams.use_spatial_split = params->use_bvh_spatial_split;
+      bparams.use_compact_structure = params->use_bvh_compact_structure;
       bparams.bvh_layout = bvh_layout;
       bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
                                     params->use_bvh_unaligned_nodes;
diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp
index 3595ca55a46..7aad46d253c 100644
--- a/intern/cycles/scene/image.cpp
+++ b/intern/cycles/scene/image.cpp
@@ -576,13 +576,13 @@ bool ImageManager::file_load_image(Image *img, int texture_limit)
         pixels[i * 4 + 3] = one;
       }
     }
+  }
 
-    if (img->metadata.colorspace != u_colorspace_raw &&
-        img->metadata.colorspace != u_colorspace_srgb) {
-      /* Convert to scene linear. */
-      ColorSpaceManager::to_scene_linear(
-          img->metadata.colorspace, pixels, num_pixels, img->metadata.compress_as_srgb);
-    }
+  if (img->metadata.colorspace != u_colorspace_raw &&
+      img->metadata.colorspace != u_colorspace_srgb) {
+    /* Convert to scene linear. */
+    ColorSpaceManager::to_scene_linear(
+        img->metadata.colorspace, pixels, num_pixels, is_rgba, img->metadata.compress_as_srgb);
   }
 
   /* Make sure we don't have buggy values. */
@@ -891,6 +891,10 @@ void ImageManager::device_free(Device *device)
 void ImageManager::collect_statistics(RenderStats *stats)
 {
   foreach (const Image *image, images) {
+    if (!image) {
+      /* Image may have been freed due to lack of users. */
+      continue;
+    }
     stats->image.textures.add_entry(
         NamedSizeEntry(image->loader->name(), image->mem->memory_size()));
   }
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index 1963ebbbb19..b5b8eee24a7 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -570,7 +570,6 @@ static void log_kernel_features(const uint features)
           << "\n";
   VLOG(2) << "Use Emission " << string_from_bool(features & KERNEL_FEATURE_NODE_EMISSION) << "\n";
   VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_NODE_VOLUME) << "\n";
-  VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_NODE_HAIR) << "\n";
   VLOG(2) << "Use Bump " << string_from_bool(features & KERNEL_FEATURE_NODE_BUMP) << "\n";
   VLOG(2) << "Use Voronoi " << string_from_bool(features & KERNEL_FEATURE_NODE_VORONOI_EXTRA)
           << "\n";
diff --git a/intern/cycles/scene/scene.h b/intern/cycles/scene/scene.h
index ec935b41be6..77268837070 100644
--- a/intern/cycles/scene/scene.h
+++ b/intern/cycles/scene/scene.h
@@ -160,6 +160,7 @@ class SceneParams {
 
   BVHType bvh_type;
   bool use_bvh_spatial_split;
+  bool use_bvh_compact_structure;
   bool use_bvh_unaligned_nodes;
   int num_bvh_time_steps;
   int hair_subdivisions;
@@ -174,6 +175,7 @@ class SceneParams {
     bvh_layout = BVH_LAYOUT_BVH2;
     bvh_type = BVH_TYPE_DYNAMIC;
     use_bvh_spatial_split = false;
+    use_bvh_compact_structure = true;
     use_bvh_unaligned_nodes = true;
     num_bvh_time_steps = 0;
     hair_subdivisions = 3;
@@ -187,6 +189,7 @@ class SceneParams {
     return !(shadingsystem == params.shadingsystem && bvh_layout == params.bvh_layout &&
              bvh_type == params.bvh_type &&
              use_bvh_spatial_split == params.use_bvh_spatial_split &&
+             use_bvh_compact_structure == params.use_bvh_compact_structure &&
              use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes &&
              num_bvh_time_steps == params.num_bvh_time_steps &&
              hair_subdivisions == params.hair_subdivisions && hair_shape == params.hair_shape &&
diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp
index e8316ad41b4..34675be8e80 100644
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -32,6 +32,7 @@
 #include "util/color.h"
 #include "util/foreach.h"
 #include "util/log.h"
+#include "util/string.h"
 #include "util/transform.h"
 
 #include "kernel/tables.h"
@@ -462,8 +463,12 @@ void ImageTextureNode::compile(OSLCompiler &compiler)
   const ustring known_colorspace = metadata.colorspace;
 
   if (handle.svm_slot() == -1) {
+    /* OIIO currently does not support <UVTILE> substitutions natively. Replace with a format they
+     * understand. */
+    std::string osl_filename = filename.string();
+    string_replace(osl_filename, "<UVTILE>", "<U>_<V>");
     compiler.parameter_texture(
-        "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
+        "filename", ustring(osl_filename), compress_as_srgb ? u_colorspace_raw : known_colorspace);
   }
   else {
     compiler.parameter_texture("filename", handle.svm_slot());
@@ -472,7 +477,8 @@ void ImageTextureNode::compile(OSLCompiler &compiler)
   const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
                                    alpha_type == IMAGE_ALPHA_CHANNEL_PACKED ||
                                    alpha_type == IMAGE_ALPHA_IGNORE);
-  const bool is_tiled = (filename.find("<UDIM>") != string::npos);
+  const bool is_tiled = (filename.find("<UDIM>") != string::npos ||
+                         filename.find("<UVTILE>") != string::npos);
 
   compiler.parameter(this, "projection");
   compiler.parameter(this, "projection_blend");
@@ -4388,9 +4394,6 @@ NODE_DEFINE(HairInfoNode)
   SOCKET_OUT_FLOAT(size, "Length");
   SOCKET_OUT_FLOAT(thickness, "Thickness");
   SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal");
-#if 0 /* Output for minimum hair width transparency - deactivated. */
-  SOCKET_OUT_FLOAT(fade, "Fade");
-#endif
   SOCKET_OUT_FLOAT(index, "Random");
 
   return type;
@@ -4448,12 +4451,7 @@ void HairInfoNode::compile(SVMCompiler &compiler)
   if (!out->links.empty()) {
     compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_TANGENT_NORMAL, compiler.stack_assign(out));
   }
-#if 0
-  out = output("Fade");
-  if(!out->links.empty()) {
-    compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_FADE, compiler.stack_assign(out));
-  }
-#endif
+
   out = output("Random");
   if (!out->links.empty()) {
     int attr = compiler.attribute(ATTR_STD_CURVE_RANDOM);
@@ -4466,6 +4464,59 @@ void HairInfoNode::compile(OSLCompiler &compiler)
   compiler.add(this, "node_hair_info");
 }
 
+/* Point Info */
+
+NODE_DEFINE(PointInfoNode)
+{
+  NodeType *type = NodeType::add("point_info", create, NodeType::SHADER);
+
+  SOCKET_OUT_POINT(position, "Position");
+  SOCKET_OUT_FLOAT(radius, "Radius");
+  SOCKET_OUT_FLOAT(random, "Random");
+
+  return type;
+}
+
+PointInfoNode::PointInfoNode() : ShaderNode(get_node_type())
+{
+}
+
+void PointInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
+{
+  if (shader->has_surface_link()) {
+    if (!output("Random")->links.empty())
+      attributes->add(ATTR_STD_POINT_RANDOM);
+  }
+
+  ShaderNode::attributes(shader, attributes);
+}
+
+void PointInfoNode::compile(SVMCompiler &compiler)
+{
+  ShaderOutput *out;
+
+  out = output("Position");
+  if (!out->links.empty()) {
+    compiler.add_node(NODE_POINT_INFO, NODE_INFO_POINT_POSITION, compiler.stack_assign(out));
+  }
+
+  out = output("Radius");
+  if (!out->links.empty()) {
+    compiler.add_node(NODE_POINT_INFO, NODE_INFO_POINT_RADIUS, compiler.stack_assign(out));
+  }
+
+  out = output("Random");
+  if (!out->links.empty()) {
+    int attr = compiler.attribute(ATTR_STD_POINT_RANDOM);
+    compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
+  }
+}
+
+void PointInfoNode::compile(OSLCompiler &compiler)
+{
+  compiler.add(this, "node_point_info");
+}
+
 /* Volume Info */
 
 NODE_DEFINE(VolumeInfoNode)
diff --git a/intern/cycles/scene/shader_nodes.h b/intern/cycles/scene/shader_nodes.h
index 0faefd3041f..a8d5bdcf157 100644
--- a/intern/cycles/scene/shader_nodes.h
+++ b/intern/cycles/scene/shader_nodes.h
@@ -1005,9 +1005,20 @@ class HairInfoNode : public ShaderNode {
   {
     return true;
   }
-  virtual int get_feature()
+};
+
+class PointInfoNode : public ShaderNode {
+ public:
+  SHADER_NODE_CLASS(PointInfoNode)
+
+  void attributes(Shader *shader, AttributeRequestSet *attributes);
+  bool has_attribute_dependency()
+  {
+    return true;
+  }
+  bool has_spatial_varying()
   {
-    return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_HAIR;
+    return true;
   }
 };
 
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 18b60b70a4b..605a19aaef0 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -401,7 +401,7 @@ ccl_device_inline float fractf(float x)
   return x - floorf(x);
 }
 
-/* Adapted from godot-engine math_funcs.h. */
+/* Adapted from `godot-engine` math_funcs.h. */
 ccl_device_inline float wrapf(float value, float max, float min)
 {
   float range = max - min;