9 files changed, 303 insertions, 84 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0cc419c7025..def35573ec3 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -830,6 +830,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
 
     debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
+    debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)
 
     debug_opencl_kernel_type: EnumProperty(
         name="OpenCL Kernel Type",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index bf81a360020..cce3f805b39 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2031,6 +2031,7 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
         col = layout.column()
         col.label(text="OptiX Flags:")
         col.prop(cscene, "debug_optix_cuda_streams")
+        col.prop(cscene, "debug_optix_curves_api")
 
         col.separator()
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 3e595c3ee52..25c77b74ce3 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -92,6 +92,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
   flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
   /* Synchronize OptiX flags. */
   flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
+  flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
   /* Synchronize OpenCL device type. */
   switch (get_enum(cscene, "debug_opencl_device_type")) {
     case 0:
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ececca3df53..266222c74c5 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -131,8 +131,12 @@ class OptiXDevice : public CUDADevice {
     PG_RGEN,
     PG_MISS,
     PG_HITD,  // Default hit group
-    PG_HITL,  // __BVH_LOCAL__ hit group
     PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
+    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
+#  if OPTIX_ABI_VERSION >= 36
+    PG_HITD_MOTION,
+    PG_HITS_MOTION,
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
     PG_EXCP,
 #  endif
@@ -177,6 +181,7 @@ class OptiXDevice : public CUDADevice {
   OptixDeviceContext context = NULL;
 
   OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
+  OptixModule builtin_modules[2] = {};
   OptixPipeline pipelines[NUM_PIPELINES] = {};
 
   bool motion_blur = false;
@@ -264,6 +269,9 @@ class OptiXDevice : public CUDADevice {
     // Unload modules
     if (optix_module != NULL)
       optixModuleDestroy(optix_module);
+    for (unsigned int i = 0; i < 2; ++i)
+      if (builtin_modules[i] != NULL)
+        optixModuleDestroy(builtin_modules[i]);
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
       if (pipelines[i] != NULL)
         optixPipelineDestroy(pipelines[i]);
@@ -338,6 +346,12 @@ class OptiXDevice : public CUDADevice {
       optixModuleDestroy(optix_module);
       optix_module = NULL;
     }
+    for (unsigned int i = 0; i < 2; ++i) {
+      if (builtin_modules[i] != NULL) {
+        optixModuleDestroy(builtin_modules[i]);
+        builtin_modules[i] = NULL;
+      }
+    }
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
       if (pipelines[i] != NULL) {
         optixPipelineDestroy(pipelines[i]);
@@ -369,6 +383,18 @@ class OptiXDevice : public CUDADevice {
 #  endif
     pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h
 
+#  if OPTIX_ABI_VERSION >= 36
+    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+    if (requested_features.use_hair) {
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+      }
+      else {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+      }
+    }
+#  endif
+
     // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
     // This is necessary since objects may be reported to have motion if the Vector pass is
     // active, but may still need to be rendered without motion blur if that isn't active as well
@@ -442,6 +468,34 @@ class OptiXDevice : public CUDADevice {
         group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
         group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
       }
+
+#  if OPTIX_ABI_VERSION >= 36
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        OptixBuiltinISOptions builtin_options;
+        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+        builtin_options.usesMotionBlur = false;
+
+        check_result_optix_ret(optixBuiltinISModuleGet(
+            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+        if (motion_blur) {
+          builtin_options.usesMotionBlur = true;
+
+          check_result_optix_ret(optixBuiltinISModuleGet(
+              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+        }
+      }
+#  endif
     }
 
     if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
@@ -493,8 +547,14 @@ class OptiXDevice : public CUDADevice {
     unsigned int trace_css = stack_size[PG_HITD].cssCH;
     // This is based on the maximum of closest-hit and any-hit/intersection programs
     trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
-    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
     trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+#  if OPTIX_ABI_VERSION >= 36
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+#  endif
 
     OptixPipelineLinkOptions link_options;
     link_options.maxTraceDepth = 1;
@@ -503,17 +563,23 @@ class OptiXDevice : public CUDADevice {
 #  else
     link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
 #  endif
-    link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;
+#  if OPTIX_ABI_VERSION < 24
+    link_options.overrideUsesMotionBlur = motion_blur;
+#  endif
 
     {  // Create path tracing pipeline
       OptixProgramGroup pipeline_groups[] = {
-          groups[PG_RGEN],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_RGEN],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
       };
       check_result_optix_ret(
@@ -530,8 +596,8 @@ class OptiXDevice : public CUDADevice {
       const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
 
       // Set stack size depending on pipeline options
-      check_result_optix_ret(optixPipelineSetStackSize(
-          pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+      check_result_optix_ret(
+          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2)));
     }
 
     // Only need to create shader evaluation pipeline if one of these features is used:
@@ -541,15 +607,19 @@ class OptiXDevice : public CUDADevice {
 
     if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
       OptixProgramGroup pipeline_groups[] = {
-          groups[PG_BAKE],
-          groups[PG_DISP],
-          groups[PG_BACK],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_BAKE],
+        groups[PG_DISP],
+        groups[PG_BACK],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
       };
       check_result_optix_ret(
@@ -672,7 +742,11 @@ class OptiXDevice : public CUDADevice {
       sbt_params.missRecordCount = 1;
       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif
 
       // Launch the ray generation program
       check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
@@ -836,7 +910,9 @@ class OptiXDevice : public CUDADevice {
         assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
         denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
             OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1));
+#  if OPTIX_ABI_VERSION < 28
         denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
+#  endif
         check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
         check_result_optix_ret(
             optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
@@ -849,7 +925,11 @@ class OptiXDevice : public CUDADevice {
       check_result_optix_ret(
           optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
 
+#  if OPTIX_ABI_VERSION < 28
       const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
+#  else
+      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
+#  endif
       const size_t scratch_offset = sizes.stateSizeInBytes;
 
       // Allocate denoiser state if tile size has changed since last setup
@@ -993,7 +1073,11 @@ class OptiXDevice : public CUDADevice {
       sbt_params.missRecordCount = 1;
       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif
 
       check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                      cuda_stream[thread_index],
@@ -1070,7 +1154,7 @@ class OptiXDevice : public CUDADevice {
                                            &build_input,
                                            1,
                                            temp_mem.device_pointer,
-                                           temp_mem.device_size,
+                                           sizes.tempSizeInBytes,
                                            out_data,
                                            sizes.outputSizeInBytes,
                                            &out_handle,
@@ -1142,7 +1226,6 @@ class OptiXDevice : public CUDADevice {
           continue;
         }
 
-        const size_t num_curves = hair->num_curves();
         const size_t num_segments = hair->num_segments();
 
         size_t num_motion_steps = 1;
@@ -1152,7 +1235,18 @@ class OptiXDevice : public CUDADevice {
         }
 
         device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
-        aabb_data.alloc(num_segments * num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+        device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+        // Four control points for each curve segment
+        const size_t num_vertices = num_segments * 4;
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          index_data.alloc(num_segments);
+          vertex_data.alloc(num_vertices * num_motion_steps);
+        }
+        else
+#  endif
+          aabb_data.alloc(num_segments * num_motion_steps);
 
         // Get AABBs for each motion step
         for (size_t step = 0; step < num_motion_steps; ++step) {
@@ -1165,44 +1259,127 @@ class OptiXDevice : public CUDADevice {
             keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size();
           }
 
-          size_t i = step * num_segments;
-          for (size_t j = 0; j < num_curves; ++j) {
-            const Hair::Curve c = hair->get_curve(j);
-
-            for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
-              BoundBox bounds = BoundBox::empty;
-              c.bounds_grow(k, keys, hair->curve_radius.data(), bounds);
-
-              aabb_data[i].minX = bounds.min.x;
-              aabb_data[i].minY = bounds.min.y;
-              aabb_data[i].minZ = bounds.min.z;
-              aabb_data[i].maxX = bounds.max.x;
-              aabb_data[i].maxY = bounds.max.y;
-              aabb_data[i].maxZ = bounds.max.z;
+          for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+            const Hair::Curve curve = hair->get_curve(j);
+
+            for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+#  if OPTIX_ABI_VERSION >= 36
+              if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+                int k0 = curve.first_key + segment;
+                int k1 = k0 + 1;
+                int ka = max(k0 - 1, curve.first_key);
+                int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+                const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+                const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+                const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+                const float4 pw = make_float4(hair->curve_radius[ka],
+                                              hair->curve_radius[k0],
+                                              hair->curve_radius[k1],
+                                              hair->curve_radius[kb]);
+
+                // Convert Catmull-Rom data to Bezier spline
+                static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+                static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+                static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+                static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+                index_data[i] = i * 4;
+                float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+                v[0] = make_float4(
+                    dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+                v[1] = make_float4(
+                    dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+                v[2] = make_float4(
+                    dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+                v[3] = make_float4(
+                    dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+              }
+              else
+#  endif
+              {
+                BoundBox bounds = BoundBox::empty;
+                curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds);
+
+                const size_t index = step * num_segments + i;
+                aabb_data[index].minX = bounds.min.x;
+                aabb_data[index].minY = bounds.min.y;
+                aabb_data[index].minZ = bounds.min.z;
+                aabb_data[index].maxX = bounds.max.x;
+                aabb_data[index].maxY = bounds.max.y;
+                aabb_data[index].maxZ = bounds.max.z;
+              }
             }
           }
         }
 
         // Upload AABB data to GPU
         aabb_data.copy_to_device();
+#  if OPTIX_ABI_VERSION >= 36
+        index_data.copy_to_device();
+        vertex_data.copy_to_device();
+#  endif
 
         vector<device_ptr> aabb_ptrs;
         aabb_ptrs.reserve(num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        vector<device_ptr> width_ptrs;
+        vector<device_ptr> vertex_ptrs;
+        width_ptrs.reserve(num_motion_steps);
+        vertex_ptrs.reserve(num_motion_steps);
+#  endif
         for (size_t step = 0; step < num_motion_steps; ++step) {
           aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+#  if OPTIX_ABI_VERSION >= 36
+          const device_ptr base_ptr = vertex_data.device_pointer +
+                                      step * num_vertices * sizeof(float4);
+          width_ptrs.push_back(base_ptr + 3 * sizeof(float));  // Offset by vertex size
+          vertex_ptrs.push_back(base_ptr);
+#  endif
         }
 
-        // Disable visibility test anyhit program, since it is already checked during intersection
-        // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
         OptixBuildInput build_input = {};
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-        build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
-        build_input.aabbArray.numPrimitives = num_segments;
-        build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
-        build_input.aabbArray.flags = &build_flags;
-        build_input.aabbArray.numSbtRecords = 1;
-        build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  if OPTIX_ABI_VERSION >= 36
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+          build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+          build_input.curveArray.numPrimitives = num_segments;
+          build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+          build_input.curveArray.numVertices = num_vertices;
+          build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+          build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+          build_input.curveArray.widthStrideInBytes = sizeof(float4);
+          build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+          build_input.curveArray.indexStrideInBytes = sizeof(int);
+          build_input.curveArray.flag = build_flags;
+          build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+        }
+        else
+#  endif
+        {
+          // Disable visibility test any-hit program, since it is already checked during
+          // intersection. Those trace calls that require anyhit can force it with a ray flag.
+          build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+#  if OPTIX_ABI_VERSION < 23
+          build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.aabbArray.numPrimitives = num_segments;
+          build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+          build_input.aabbArray.flags = &build_flags;
+          build_input.aabbArray.numSbtRecords = 1;
+          build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  else
+          build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.customPrimitiveArray.numPrimitives = num_segments;
+          build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+          build_input.customPrimitiveArray.flags = &build_flags;
+          build_input.customPrimitiveArray.numSbtRecords = 1;
+          build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  endif
+        }
 
         // Allocate memory for new BLAS and build it
         OptixTraversableHandle handle;
@@ -1257,8 +1434,8 @@ class OptiXDevice : public CUDADevice {
           vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
         }
 
-        // No special build flags for triangle primitives
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
         OptixBuildInput build_input = {};
         build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
         build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
@@ -1324,9 +1501,26 @@ class OptiXDevice : public CUDADevice {
       // Set user instance ID to object index
       instance.instanceId = ob->get_device_index();
 
-      // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
-      // See 'scene_intersect_volume' in bvh.h
-      instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1);
+      // Have to have at least one bit in the mask, or else instance would always be culled
+      instance.visibilityMask = 1;
+
+      if (ob->geometry->has_volume) {
+        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+        instance.visibilityMask |= 2;
+      }
+
+      if (ob->geometry->type == Geometry::HAIR) {
+        // Same applies to curves (so they can be skipped in local trace calls)
+        instance.visibilityMask |= 4;
+
+#  if OPTIX_ABI_VERSION >= 36
+        if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api &&
+            static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) {
+          // Select between motion blur and non-motion blur built-in intersection module
+          instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+        }
+#  endif
+      }
 
       // Insert motion traversable if object has motion
       if (motion_blur && ob->use_motion()) {
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 80b58f46329..3049f243ae9 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -172,11 +172,11 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
-             0xFF,
+             0xF,
              OPTIX_RAY_FLAG_NONE,
+             0,  // SBT offset for PG_HITD
              0,
              0,
-             0,  // SBT offset for PG_HITD
              p0,
              p1,
              p2,
@@ -264,12 +264,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
+             // Skip curves
+             0x3,
              // Need to always call into __anyhit__kernel_optix_local_hit
-             0xFF,
              OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1,
+             2,  // SBT offset for PG_HITL
+             0,
              0,
-             0,  // SBT offset for PG_HITL
              p0,
              p1,
              p2,
@@ -374,12 +375,12 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
+             0xF,
              // Need to always call into __anyhit__kernel_optix_shadow_all_hit
-             0xFF,
              OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2,
+             1,  // SBT offset for PG_HITS
+             0,
              0,
-             0,  // SBT offset for PG_HITS
              p0,
              p1,
              *num_hits,
@@ -458,12 +459,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
-             // Visibility mask set to only intersect objects with volumes
-             0x02,
+             // Skip everything but volumes
+             0x2,
              OPTIX_RAY_FLAG_NONE,
+             0,  // SBT offset for PG_HITD
              0,
              0,
-             0,  // SBT offset for PG_HITD
              p0,
              p1,
              p2,
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index c04dbee52cc..06d2c016f5b 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -734,7 +734,6 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
   }
 
   sd->u = isect->u;
-  sd->v = isect->v;
 
   P = P + D * t;
 
@@ -750,6 +749,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
 
     sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
     sd->Ng = -D;
+    sd->v = isect->v;
 
 #  if 0
     /* This approximates the position and geometric normal of a thick curve too,
@@ -764,8 +764,11 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
      * This could be optimized by recording the normal in the intersection,
      * however for Optix this would go beyond the size of the payload. */
     const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
-    sd->Ng = normalize(P - P_inside);
-    sd->N = sd->Ng;
+    const float3 Ng = normalize(P - P_inside);
+
+    sd->N = Ng;
+    sd->Ng = Ng;
+    sd->v = 0.0f;
   }
 
 #  ifdef __DPDU__
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
index c730d952ed4..3b166e59dfd 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// clang-format off
 #include "kernel/kernel_compat_optix.h"
 #include "util/util_atomic.h"
 #include "kernel/kernel_types.h"
@@ -23,6 +24,7 @@
 
 #include "kernel/kernel_path.h"
 #include "kernel/kernel_bake.h"
+// clang-format on
 
 template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
 {
@@ -139,8 +141,8 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
   }
   else {
     if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      // Record closest intersection only (do not terminate ray here, since there is no guarantee
-      // about distance ordering in anyhit)
+      // Record closest intersection only
+      // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
       return optixIgnoreIntersection();
     }
 
@@ -153,15 +155,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
   isect->object = get_object_id();
   isect->type = kernel_tex_fetch(__prim_type, isect->prim);
 
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    isect->u = 1.0f - barycentrics.y - barycentrics.x;
-    isect->v = barycentrics.x;
-  }
-  else {
-    isect->u = __uint_as_float(optixGetAttribute_0());
-    isect->v = __uint_as_float(optixGetAttribute_1());
-  }
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = 1.0f - barycentrics.y - barycentrics.x;
+  isect->v = barycentrics.x;
 
   // Record geometric normal
   const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
@@ -198,10 +194,18 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
     isect->u = 1.0f - barycentrics.y - barycentrics.x;
     isect->v = barycentrics.x;
   }
+#  ifdef __HAIR__
   else {
-    isect->u = __uint_as_float(optixGetAttribute_0());
+    const float u = __uint_as_float(optixGetAttribute_0());
+    isect->u = u;
     isect->v = __uint_as_float(optixGetAttribute_1());
+
+    // Filter out curve endcaps
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
   }
+#  endif
 
 #  ifdef __TRANSPARENT_SHADOWS__
   // Detect if this surface has a shader with transparent shadows
@@ -213,7 +217,6 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 #  ifdef __TRANSPARENT_SHADOWS__
   }
 
-  // TODO(pmours): Do we need REQUIRE_UNIQUE_ANYHIT for this to work?
   optixSetPayload_2(optixGetPayload_2() + 1);  // num_hits++
 
   // Continue tracing
@@ -227,13 +230,25 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
   uint visibility = optixGetPayload_4();
 #ifdef __VISIBILITY_FLAG__
   const uint prim = optixGetPrimitiveIndex();
-  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0)
+  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
     return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    // Filter out curve endcaps
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
 #endif
 
   // Shadow ray early termination
-  if (visibility & PATH_RAY_SHADOW_OPAQUE)
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
     return optixTerminateRay();
+  }
 }
 
 extern "C" __global__ void __closesthit__kernel_optix_hit()
@@ -250,7 +265,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
     optixSetPayload_2(__float_as_uint(barycentrics.x));
   }
   else {
-    optixSetPayload_1(optixGetAttribute_0());
+    optixSetPayload_1(optixGetAttribute_0());  // Same as 'optixGetCurveParameter()'
     optixSetPayload_2(optixGetAttribute_1());
   }
 }
@@ -286,7 +301,6 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
                             __float_as_int(isect.u),   // Attribute_0
                             __float_as_int(isect.v));  // Attribute_1
   }
-
 }
 
 extern "C" __global__ void __intersection__curve_ribbon()
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 6ad4f709ab5..74ecefa1917 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -83,6 +83,7 @@ DebugFlags::OptiX::OptiX()
 void DebugFlags::OptiX::reset()
 {
   cuda_streams = 1;
+  curves_api = false;
 }
 
 DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index da9f5408b59..6ac4beb55b8 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -108,6 +108,9 @@ class DebugFlags {
 
     /* Number of CUDA streams to launch kernels concurrently from. */
     int cuda_streams;
+
+    /* Use OptiX curves API for hair instead of custom implementation. */
+    bool curves_api;
   };
 
   /* Descriptor of OpenCL feature-set to be used. */