Cycles: Add support for native OptiX curve primitive

This patch adds support for the curve primitive from OptiX to Cycles. It's currently hidden behind a debug option, since there can be some slight rendering differences still (because no backface culling is performed and something seems off with endcaps). The curve primitive was added with the OptiX 7.1 SDK and requires a r450 driver or newer, so this also updates the codebase to be able to build with the new SDK. Reviewed By: brecht Differential Revision: https://developer.blender.org/D8223
author: Patrick Mours <pmours@nvidia.com> 2020-07-06 13:25:54 +0300
committer: Patrick Mours <pmours@nvidia.com> 2020-07-07 16:39:02 +0300
commit: 737bd549b6eeee81f0573ad8e305fb8d888d82ec (patch)
tree: fc8aaca871557d652287cb3d4f9ee3033ff06e46 /intern
parent: 95f0f312799e10e4a9f5c884f8f20ec76d0ff363 (diff)
9 files changed, 303 insertions, 84 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0cc419c7025..def35573ec3 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -830,6 +830,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
 
     debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
+    debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)
 
     debug_opencl_kernel_type: EnumProperty(
         name="OpenCL Kernel Type",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index bf81a360020..cce3f805b39 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2031,6 +2031,7 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
         col = layout.column()
         col.label(text="OptiX Flags:")
         col.prop(cscene, "debug_optix_cuda_streams")
+        col.prop(cscene, "debug_optix_curves_api")
 
         col.separator()
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 3e595c3ee52..25c77b74ce3 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -92,6 +92,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
   flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
   /* Synchronize OptiX flags. */
   flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
+  flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
   /* Synchronize OpenCL device type. */
   switch (get_enum(cscene, "debug_opencl_device_type")) {
     case 0:
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ececca3df53..266222c74c5 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -131,8 +131,12 @@ class OptiXDevice : public CUDADevice {
     PG_RGEN,
     PG_MISS,
     PG_HITD,  // Default hit group
-    PG_HITL,  // __BVH_LOCAL__ hit group
     PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
+    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
+#  if OPTIX_ABI_VERSION >= 36
+    PG_HITD_MOTION,
+    PG_HITS_MOTION,
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
     PG_EXCP,
 #  endif
@@ -177,6 +181,7 @@ class OptiXDevice : public CUDADevice {
   OptixDeviceContext context = NULL;
 
   OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
+  OptixModule builtin_modules[2] = {};
   OptixPipeline pipelines[NUM_PIPELINES] = {};
 
   bool motion_blur = false;
@@ -264,6 +269,9 @@ class OptiXDevice : public CUDADevice {
     // Unload modules
     if (optix_module != NULL)
       optixModuleDestroy(optix_module);
+    for (unsigned int i = 0; i < 2; ++i)
+      if (builtin_modules[i] != NULL)
+        optixModuleDestroy(builtin_modules[i]);
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
       if (pipelines[i] != NULL)
         optixPipelineDestroy(pipelines[i]);
@@ -338,6 +346,12 @@ class OptiXDevice : public CUDADevice {
       optixModuleDestroy(optix_module);
       optix_module = NULL;
     }
+    for (unsigned int i = 0; i < 2; ++i) {
+      if (builtin_modules[i] != NULL) {
+        optixModuleDestroy(builtin_modules[i]);
+        builtin_modules[i] = NULL;
+      }
+    }
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
       if (pipelines[i] != NULL) {
         optixPipelineDestroy(pipelines[i]);
@@ -369,6 +383,18 @@ class OptiXDevice : public CUDADevice {
 #  endif
     pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h
 
+#  if OPTIX_ABI_VERSION >= 36
+    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+    if (requested_features.use_hair) {
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+      }
+      else {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+      }
+    }
+#  endif
+
     // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
     // This is necessary since objects may be reported to have motion if the Vector pass is
     // active, but may still need to be rendered without motion blur if that isn't active as well
@@ -442,6 +468,34 @@ class OptiXDevice : public CUDADevice {
         group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
         group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
       }
+
+#  if OPTIX_ABI_VERSION >= 36
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        OptixBuiltinISOptions builtin_options;
+        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+        builtin_options.usesMotionBlur = false;
+
+        check_result_optix_ret(optixBuiltinISModuleGet(
+            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+        if (motion_blur) {
+          builtin_options.usesMotionBlur = true;
+
+          check_result_optix_ret(optixBuiltinISModuleGet(
+              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+        }
+      }
+#  endif
     }
 
     if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
@@ -493,8 +547,14 @@ class OptiXDevice : public CUDADevice {
     unsigned int trace_css = stack_size[PG_HITD].cssCH;
     // This is based on the maximum of closest-hit and any-hit/intersection programs
     trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
-    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
     trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+#  if OPTIX_ABI_VERSION >= 36
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+#  endif
 
     OptixPipelineLinkOptions link_options;
     link_options.maxTraceDepth = 1;
@@ -503,17 +563,23 @@ class OptiXDevice : public CUDADevice {
 #  else
     link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
 #  endif
-    link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;
+#  if OPTIX_ABI_VERSION < 24
+    link_options.overrideUsesMotionBlur = motion_blur;
+#  endif
 
     {  // Create path tracing pipeline
       OptixProgramGroup pipeline_groups[] = {
-          groups[PG_RGEN],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_RGEN],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
       };
       check_result_optix_ret(
@@ -530,8 +596,8 @@ class OptiXDevice : public CUDADevice {
       const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
 
       // Set stack size depending on pipeline options
-      check_result_optix_ret(optixPipelineSetStackSize(
-          pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+      check_result_optix_ret(
+          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2)));
     }
 
     // Only need to create shader evaluation pipeline if one of these features is used:
@@ -541,15 +607,19 @@ class OptiXDevice : public CUDADevice {
 
     if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
       OptixProgramGroup pipeline_groups[] = {
-          groups[PG_BAKE],
-          groups[PG_DISP],
-          groups[PG_BACK],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_BAKE],
+        groups[PG_DISP],
+        groups[PG_BACK],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
       };
       check_result_optix_ret(
@@ -672,7 +742,11 @@ class OptiXDevice : public CUDADevice {
       sbt_params.missRecordCount = 1;
       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif
 
       // Launch the ray generation program
       check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
@@ -836,7 +910,9 @@ class OptiXDevice : public CUDADevice {
         assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
         denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
             OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1));
+#  if OPTIX_ABI_VERSION < 28
         denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
+#  endif
         check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
         check_result_optix_ret(
             optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
@@ -849,7 +925,11 @@ class OptiXDevice : public CUDADevice {
       check_result_optix_ret(
           optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
 
+#  if OPTIX_ABI_VERSION < 28
       const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
+#  else
+      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
+#  endif
       const size_t scratch_offset = sizes.stateSizeInBytes;
 
       // Allocate denoiser state if tile size has changed since last setup
@@ -993,7 +1073,11 @@ class OptiXDevice : public CUDADevice {
       sbt_params.missRecordCount = 1;
       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif
 
       check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                      cuda_stream[thread_index],
@@ -1070,7 +1154,7 @@ class OptiXDevice : public CUDADevice {
                                            &build_input,
                                            1,
                                            temp_mem.device_pointer,
-                                           temp_mem.device_size,
+                                           sizes.tempSizeInBytes,
                                            out_data,
                                            sizes.outputSizeInBytes,
                                            &out_handle,
@@ -1142,7 +1226,6 @@ class OptiXDevice : public CUDADevice {
           continue;
         }
 
-        const size_t num_curves = hair->num_curves();
         const size_t num_segments = hair->num_segments();
 
         size_t num_motion_steps = 1;
@@ -1152,7 +1235,18 @@ class OptiXDevice : public CUDADevice {
         }
 
         device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
-        aabb_data.alloc(num_segments * num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+        device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+        // Four control points for each curve segment
+        const size_t num_vertices = num_segments * 4;
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          index_data.alloc(num_segments);
+          vertex_data.alloc(num_vertices * num_motion_steps);
+        }
+        else
+#  endif
+          aabb_data.alloc(num_segments * num_motion_steps);
 
         // Get AABBs for each motion step
         for (size_t step = 0; step < num_motion_steps; ++step) {
@@ -1165,44 +1259,127 @@ class OptiXDevice : public CUDADevice {
             keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size();
           }
 
-          size_t i = step * num_segments;
-          for (size_t j = 0; j < num_curves; ++j) {
-            const Hair::Curve c = hair->get_curve(j);
-
-            for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
-              BoundBox bounds = BoundBox::empty;
-              c.bounds_grow(k, keys, hair->curve_radius.data(), bounds);
-
-              aabb_data[i].minX = bounds.min.x;
-              aabb_data[i].minY = bounds.min.y;
-              aabb_data[i].minZ = bounds.min.z;
-              aabb_data[i].maxX = bounds.max.x;
-              aabb_data[i].maxY = bounds.max.y;
-              aabb_data[i].maxZ = bounds.max.z;
+          for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+            const Hair::Curve curve = hair->get_curve(j);
+
+            for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+#  if OPTIX_ABI_VERSION >= 36
+              if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+                int k0 = curve.first_key + segment;
+                int k1 = k0 + 1;
+                int ka = max(k0 - 1, curve.first_key);
+                int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+                const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+                const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+                const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+                const float4 pw = make_float4(hair->curve_radius[ka],
+                                              hair->curve_radius[k0],
+                                              hair->curve_radius[k1],
+                                              hair->curve_radius[kb]);
+
+                // Convert Catmull-Rom data to Bezier spline
+                static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+                static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+                static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+                static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+                index_data[i] = i * 4;
+                float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+                v[0] = make_float4(
+                    dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+                v[1] = make_float4(
+                    dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+                v[2] = make_float4(
+                    dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+                v[3] = make_float4(
+                    dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+              }
+              else
+#  endif
+              {
+                BoundBox bounds = BoundBox::empty;
+                curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds);
+
+                const size_t index = step * num_segments + i;
+                aabb_data[index].minX = bounds.min.x;
+                aabb_data[index].minY = bounds.min.y;
+                aabb_data[index].minZ = bounds.min.z;
+                aabb_data[index].maxX = bounds.max.x;
+                aabb_data[index].maxY = bounds.max.y;
+                aabb_data[index].maxZ = bounds.max.z;
+              }
             }
           }
         }
 
         // Upload AABB data to GPU
         aabb_data.copy_to_device();
+#  if OPTIX_ABI_VERSION >= 36
+        index_data.copy_to_device();
+        vertex_data.copy_to_device();
+#  endif
 
         vector<device_ptr> aabb_ptrs;
         aabb_ptrs.reserve(num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        vector<device_ptr> width_ptrs;
+        vector<device_ptr> vertex_ptrs;
+        width_ptrs.reserve(num_motion_steps);
+        vertex_ptrs.reserve(num_motion_steps);
+#  endif
         for (size_t step = 0; step < num_motion_steps; ++step) {
           aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+#  if OPTIX_ABI_VERSION >= 36
+          const device_ptr base_ptr = vertex_data.device_pointer +
+                                      step * num_vertices * sizeof(float4);
+          width_ptrs.push_back(base_ptr + 3 * sizeof(float));  // Offset by vertex size
+          vertex_ptrs.push_back(base_ptr);
+#  endif
         }
 
-        // Disable visibility test anyhit program, since it is already checked during intersection
-        // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
         OptixBuildInput build_input = {};
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-        build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
-        build_input.aabbArray.numPrimitives = num_segments;
-        build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
-        build_input.aabbArray.flags = &build_flags;
-        build_input.aabbArray.numSbtRecords = 1;
-        build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  if OPTIX_ABI_VERSION >= 36
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+          build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+          build_input.curveArray.numPrimitives = num_segments;
+          build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+          build_input.curveArray.numVertices = num_vertices;
+          build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+          build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+          build_input.curveArray.widthStrideInBytes = sizeof(float4);
+          build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+          build_input.curveArray.indexStrideInBytes = sizeof(int);
+          build_input.curveArray.flag = build_flags;
+          build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+        }
+        else
+#  endif
+        {
+          // Disable visibility test any-hit program, since it is already checked during
+          // intersection. Those trace calls that require anyhit can force it with a ray flag.
+          build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+#  if OPTIX_ABI_VERSION < 23
+          build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.aabbArray.numPrimitives = num_segments;
+          build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+          build_input.aabbArray.flags = &build_flags;
+          build_input.aabbArray.numSbtRecords = 1;
+          build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  else
+          build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.customPrimitiveArray.numPrimitives = num_segments;
+          build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+          build_input.customPrimitiveArray.flags = &build_flags;
+          build_input.customPrimitiveArray.numSbtRecords = 1;
+          build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  endif
+        }
 
         // Allocate memory for new BLAS and build it
         OptixTraversableHandle handle;
@@ -1257,8 +1434,8 @@ class OptiXDevice : public CUDADevice {
           vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
         }
 
-        // No special build flags for triangle primitives
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
         OptixBuildInput build_input = {};
         build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
         build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
@@ -1324,9 +1501,26 @@ class OptiXDevice : public CUDADevice {
       // Set user instance ID to object index
       instance.instanceId = ob->get_device_index();
 
-      // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
-      // See 'scene_intersect_volume' in bvh.h
-      instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1);
+      // Have to have at least one bit in the mask, or else instance would always be culled
+      instance.visibilityMask = 1;
+
+      if (ob->geometry->has_volume) {
+        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+        instance.visibilityMask |= 2;
+      }
+
+      if (ob->geometry->type == Geometry::HAIR) {
+        // Same applies to curves (so they can be skipped in local trace calls)
+        instance.visibilityMask |= 4;
+
+#  if OPTIX_ABI_VERSION >= 36
+        if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api &&
+            static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) {
+          // Select between motion blur and non-motion blur built-in intersection module
+          instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+        }
+#  endif
+      }
 
       // Insert motion traversable if object has motion
       if (motion_blur && ob->use_motion()) {
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 80b58f46329..3049f243ae9 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -172,11 +172,11 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
-             0xFF,
+             0xF,
              OPTIX_RAY_FLAG_NONE,
+             0,  // SBT offset for PG_HITD
              0,
              0,
-             0,  // SBT offset for PG_HITD
              p0,
              p1,
              p2,
@@ -264,12 +264,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
+             // Skip curves
+             0x3,
              // Need to always call into __anyhit__kernel_optix_local_hit
-             0xFF,
              OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1,
+             2,  // SBT offset for PG_HITL
+             0,
              0,
-             0,  // SBT offset for PG_HITL
              p0,
              p1,
              p2,
@@ -374,12 +375,12 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
+             0xF,
              // Need to always call into __anyhit__kernel_optix_shadow_all_hit
-             0xFF,
              OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2,
+             1,  // SBT offset for PG_HITS
+             0,
              0,
-             0,  // SBT offset for PG_HITS
              p0,
              p1,
              *num_hits,
@@ -458,12 +459,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
              0.0f,
              ray->t,
              ray->time,
-             // Visibility mask set to only intersect objects with volumes
-             0x02,
+             // Skip everything but volumes
+             0x2,
              OPTIX_RAY_FLAG_NONE,
+             0,  // SBT offset for PG_HITD
              0,
              0,
-             0,  // SBT offset for PG_HITD
              p0,
              p1,
              p2,
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index c04dbee52cc..06d2c016f5b 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -734,7 +734,6 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
   }
 
   sd->u = isect->u;
-  sd->v = isect->v;
 
   P = P + D * t;
 
@@ -750,6 +749,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
 
     sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
     sd->Ng = -D;
+    sd->v = isect->v;
 
 #  if 0
     /* This approximates the position and geometric normal of a thick curve too,
@@ -764,8 +764,11 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
      * This could be optimized by recording the normal in the intersection,
      * however for Optix this would go beyond the size of the payload. */
     const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
-    sd->Ng = normalize(P - P_inside);
-    sd->N = sd->Ng;
+    const float3 Ng = normalize(P - P_inside);
+
+    sd->N = Ng;
+    sd->Ng = Ng;
+    sd->v = 0.0f;
   }
 
 #  ifdef __DPDU__
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
index c730d952ed4..3b166e59dfd 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/kernels/optix/kernel_optix.cu
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+// clang-format off
 #include "kernel/kernel_compat_optix.h"
 #include "util/util_atomic.h"
 #include "kernel/kernel_types.h"
@@ -23,6 +24,7 @@
 
 #include "kernel/kernel_path.h"
 #include "kernel/kernel_bake.h"
+// clang-format on
 
 template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
 {
@@ -139,8 +141,8 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
   }
   else {
     if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      // Record closest intersection only (do not terminate ray here, since there is no guarantee
-      // about distance ordering in anyhit)
+      // Record closest intersection only
+      // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
       return optixIgnoreIntersection();
     }
 
@@ -153,15 +155,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
   isect->object = get_object_id();
   isect->type = kernel_tex_fetch(__prim_type, isect->prim);
 
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    isect->u = 1.0f - barycentrics.y - barycentrics.x;
-    isect->v = barycentrics.x;
-  }
-  else {
-    isect->u = __uint_as_float(optixGetAttribute_0());
-    isect->v = __uint_as_float(optixGetAttribute_1());
-  }
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = 1.0f - barycentrics.y - barycentrics.x;
+  isect->v = barycentrics.x;
 
   // Record geometric normal
   const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
@@ -198,10 +194,18 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
     isect->u = 1.0f - barycentrics.y - barycentrics.x;
     isect->v = barycentrics.x;
   }
+#  ifdef __HAIR__
   else {
-    isect->u = __uint_as_float(optixGetAttribute_0());
+    const float u = __uint_as_float(optixGetAttribute_0());
+    isect->u = u;
     isect->v = __uint_as_float(optixGetAttribute_1());
+
+    // Filter out curve endcaps
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
   }
+#  endif
 
 #  ifdef __TRANSPARENT_SHADOWS__
   // Detect if this surface has a shader with transparent shadows
@@ -213,7 +217,6 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 #  ifdef __TRANSPARENT_SHADOWS__
   }
 
-  // TODO(pmours): Do we need REQUIRE_UNIQUE_ANYHIT for this to work?
   optixSetPayload_2(optixGetPayload_2() + 1);  // num_hits++
 
   // Continue tracing
@@ -227,13 +230,25 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
   uint visibility = optixGetPayload_4();
 #ifdef __VISIBILITY_FLAG__
   const uint prim = optixGetPrimitiveIndex();
-  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0)
+  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
     return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    // Filter out curve endcaps
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
 #endif
 
   // Shadow ray early termination
-  if (visibility & PATH_RAY_SHADOW_OPAQUE)
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
     return optixTerminateRay();
+  }
 }
 
 extern "C" __global__ void __closesthit__kernel_optix_hit()
@@ -250,7 +265,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
     optixSetPayload_2(__float_as_uint(barycentrics.x));
   }
   else {
-    optixSetPayload_1(optixGetAttribute_0());
+    optixSetPayload_1(optixGetAttribute_0());  // Same as 'optixGetCurveParameter()'
     optixSetPayload_2(optixGetAttribute_1());
   }
 }
@@ -286,7 +301,6 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
                             __float_as_int(isect.u),   // Attribute_0
                             __float_as_int(isect.v));  // Attribute_1
   }
-
 }
 
 extern "C" __global__ void __intersection__curve_ribbon()
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 6ad4f709ab5..74ecefa1917 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -83,6 +83,7 @@ DebugFlags::OptiX::OptiX()
 void DebugFlags::OptiX::reset()
 {
   cuda_streams = 1;
+  curves_api = false;
 }
 
 DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index da9f5408b59..6ac4beb55b8 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -108,6 +108,9 @@ class DebugFlags {
 
     /* Number of CUDA streams to launch kernels concurrently from. */
     int cuda_streams;
+
+    /* Use OptiX curves API for hair instead of custom implementation. */
+    bool curves_api;
   };
 
   /* Descriptor of OpenCL feature-set to be used. */
author	Patrick Mours <pmours@nvidia.com>	2020-07-06 13:25:54 +0300
committer	Patrick Mours <pmours@nvidia.com>	2020-07-07 16:39:02 +0300
commit	737bd549b6eeee81f0573ad8e305fb8d888d82ec (patch)
tree	fc8aaca871557d652287cb3d4f9ee3033ff06e46 /intern
parent	95f0f312799e10e4a9f5c884f8f20ec76d0ff363 (diff)