1 files changed, 257 insertions, 104 deletions
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index bb690551c04..38cc3330ebd 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -28,6 +28,7 @@
 #  include "scene/mesh.h"
 #  include "scene/object.h"
 #  include "scene/pass.h"
+#  include "scene/pointcloud.h"
 #  include "scene/scene.h"
 
 #  include "util/debug.h"
@@ -41,17 +42,19 @@
 #  define __KERNEL_OPTIX__
 #  include "kernel/device/optix/globals.h"
 
+#  include <optix_denoiser_tiling.h>
+
 CCL_NAMESPACE_BEGIN
 
 OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
-    : device(device), queue(device), state(device, "__denoiser_state")
+    : device(device), queue(device), state(device, "__denoiser_state", true)
 {
 }
 
 OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     : CUDADevice(info, stats, profiler),
       sbt_data(this, "__sbt", MEM_READ_ONLY),
-      launch_params(this, "__params"),
+      launch_params(this, "__params", false),
       denoiser_(this)
 {
   /* Make the CUDA context current. */
@@ -208,11 +211,15 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
   else {
     module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
-    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
   }
 
   module_options.boundValues = nullptr;
   module_options.numBoundValues = 0;
+#  if OPTIX_ABI_VERSION >= 55
+  module_options.payloadTypes = nullptr;
+  module_options.numPayloadTypes = 0;
+#  endif
 
   OptixPipelineCompileOptions pipeline_options = {};
   /* Default to no motion blur and two-level graph, since it is the fastest option. */
@@ -227,11 +234,18 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
   if (kernel_features & KERNEL_FEATURE_HAIR) {
     if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+#  if OPTIX_ABI_VERSION >= 55
+      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM;
+#  else
       pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+#  endif
     }
     else
       pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
   }
+  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+    pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+  }
 
   /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    * This is necessary since objects may be reported to have motion if the Vector pass is
@@ -324,7 +338,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
       /* Built-in thick curve intersection. */
       OptixBuiltinISOptions builtin_options = {};
+#  if OPTIX_ABI_VERSION >= 55
+      builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
+      builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+      builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */
+#  else
       builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+#  endif
       builtin_options.usesMotionBlur = false;
 
       optix_assert(optixBuiltinISModuleGet(
@@ -356,6 +376,18 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     }
   }
 
+  /* Pointclouds */
+  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+    group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
+    group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+    group_descs[PG_HITD_POINTCLOUD].hitgroup.moduleIS = optix_module;
+    group_descs[PG_HITD_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
+    group_descs[PG_HITS_POINTCLOUD] = group_descs[PG_HITS];
+    group_descs[PG_HITS_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+    group_descs[PG_HITS_POINTCLOUD].hitgroup.moduleIS = optix_module;
+    group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
+  }
+
   if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
     /* Add hit group for local intersections. */
     group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@@ -403,6 +435,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                        stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
   trace_css = std::max(trace_css,
                        stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+  trace_css = std::max(
+      trace_css, stack_size[PG_HITD_POINTCLOUD].cssIS + stack_size[PG_HITD_POINTCLOUD].cssAH);
+  trace_css = std::max(
+      trace_css, stack_size[PG_HITS_POINTCLOUD].cssIS + stack_size[PG_HITS_POINTCLOUD].cssAH);
 
   OptixPipelineLinkOptions link_options = {};
   link_options.maxTraceDepth = 1;
@@ -411,7 +447,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
   }
   else {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
   }
 
   if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
@@ -428,6 +464,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
+    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
+      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    }
     pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
     pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
@@ -467,6 +507,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
+    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
+      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    }
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -507,7 +551,7 @@ class OptiXDevice::DenoiseContext {
       : denoise_params(task.params),
         render_buffers(task.render_buffers),
         buffer_params(task.buffer_params),
-        guiding_buffer(device, "denoiser guiding passes buffer"),
+        guiding_buffer(device, "denoiser guiding passes buffer", true),
         num_samples(task.num_samples)
   {
     num_input_passes = 1;
@@ -522,9 +566,9 @@ class OptiXDevice::DenoiseContext {
       }
     }
 
-    const int num_guiding_passes = num_input_passes - 1;
+    use_guiding_passes = (num_input_passes - 1) > 0;
 
-    if (num_guiding_passes) {
+    if (use_guiding_passes) {
       if (task.allow_inplace_modification) {
         guiding_params.device_pointer = render_buffers->buffer.device_pointer;
 
@@ -577,6 +621,7 @@ class OptiXDevice::DenoiseContext {
 
   /* Number of input passes. Including the color and extra auxiliary passes. */
   int num_input_passes = 0;
+  bool use_guiding_passes = false;
   bool use_pass_albedo = false;
   bool use_pass_normal = false;
 
@@ -653,22 +698,22 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
-                  const_cast<int *>(&context.guiding_params.pass_stride),
-                  const_cast<int *>(&context.guiding_params.pass_albedo),
-                  const_cast<int *>(&context.guiding_params.pass_normal),
-                  &context.render_buffers->buffer.device_pointer,
-                  const_cast<int *>(&buffer_params.offset),
-                  const_cast<int *>(&buffer_params.stride),
-                  const_cast<int *>(&buffer_params.pass_stride),
-                  const_cast<int *>(&context.pass_sample_count),
-                  const_cast<int *>(&context.pass_denoising_albedo),
-                  const_cast<int *>(&context.pass_denoising_normal),
-                  const_cast<int *>(&buffer_params.full_x),
-                  const_cast<int *>(&buffer_params.full_y),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height),
-                  const_cast<int *>(&context.num_samples)};
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &context.guiding_params.pass_normal,
+                             &context.render_buffers->buffer.device_pointer,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.pass_sample_count,
+                             &context.pass_denoising_albedo,
+                             &context.pass_denoising_normal,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &context.num_samples);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
 }
@@ -679,11 +724,11 @@ bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
-                  const_cast<int *>(&context.guiding_params.pass_stride),
-                  const_cast<int *>(&context.guiding_params.pass_albedo),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height)};
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &buffer_params.width,
+                             &buffer_params.height);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
 }
@@ -708,7 +753,7 @@ void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
       return;
     }
   }
-  else if (!context.albedo_replaced_with_fake) {
+  else if (context.use_guiding_passes && !context.albedo_replaced_with_fake) {
     context.albedo_replaced_with_fake = true;
     if (!denoise_filter_guiding_set_fake_albedo(context)) {
       LOG(ERROR) << "Error replacing real albedo with the fake one.";
@@ -779,15 +824,15 @@ bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {&context.render_buffers->buffer.device_pointer,
-                  const_cast<int *>(&buffer_params.full_x),
-                  const_cast<int *>(&buffer_params.full_y),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height),
-                  const_cast<int *>(&buffer_params.offset),
-                  const_cast<int *>(&buffer_params.stride),
-                  const_cast<int *>(&buffer_params.pass_stride),
-                  const_cast<int *>(&pass.denoised_offset)};
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &pass.denoised_offset);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
 }
@@ -799,20 +844,20 @@ bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
 
   const int work_size = buffer_params.width * buffer_params.height;
 
-  void *args[] = {&context.render_buffers->buffer.device_pointer,
-                  const_cast<int *>(&buffer_params.full_x),
-                  const_cast<int *>(&buffer_params.full_y),
-                  const_cast<int *>(&buffer_params.width),
-                  const_cast<int *>(&buffer_params.height),
-                  const_cast<int *>(&buffer_params.offset),
-                  const_cast<int *>(&buffer_params.stride),
-                  const_cast<int *>(&buffer_params.pass_stride),
-                  const_cast<int *>(&context.num_samples),
-                  const_cast<int *>(&pass.noisy_offset),
-                  const_cast<int *>(&pass.denoised_offset),
-                  const_cast<int *>(&context.pass_sample_count),
-                  const_cast<int *>(&pass.num_components),
-                  const_cast<bool *>(&pass.use_compositing)};
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.num_samples,
+                             &pass.noisy_offset,
+                             &pass.denoised_offset,
+                             &context.pass_sample_count,
+                             &pass.num_components,
+                             &pass.use_compositing);
 
   return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
 }
@@ -870,35 +915,33 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
 
 bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
 {
-  if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
-                                  denoiser_.configured_size.y == context.buffer_params.height)) {
+  /* Limit maximum tile size denoiser can be invoked with. */
+  const int2 tile_size = make_int2(min(context.buffer_params.width, 4096),
+                                   min(context.buffer_params.height, 4096));
+
+  if (denoiser_.is_configured &&
+      (denoiser_.configured_size.x == tile_size.x && denoiser_.configured_size.y == tile_size.y)) {
     return true;
   }
 
-  const BufferParams &buffer_params = context.buffer_params;
-
-  OptixDenoiserSizes sizes = {};
   optix_assert(optixDenoiserComputeMemoryResources(
-      denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
-
-  /* Denoiser is invoked on whole images only, so no overlap needed (would be used for tiling). */
-  denoiser_.scratch_size = sizes.withoutOverlapScratchSizeInBytes;
-  denoiser_.scratch_offset = sizes.stateSizeInBytes;
+      denoiser_.optix_denoiser, tile_size.x, tile_size.y, &denoiser_.sizes));
 
   /* Allocate denoiser state if tile size has changed since last setup. */
-  denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+  denoiser_.state.alloc_to_device(denoiser_.sizes.stateSizeInBytes +
+                                  denoiser_.sizes.withOverlapScratchSizeInBytes);
 
   /* Initialize denoiser state for the current tile size. */
   const OptixResult result = optixDenoiserSetup(
       denoiser_.optix_denoiser,
       0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
             on a stream that is not the default stream */
-      buffer_params.width,
-      buffer_params.height,
+      tile_size.x + denoiser_.sizes.overlapWindowSizeInPixels * 2,
+      tile_size.y + denoiser_.sizes.overlapWindowSizeInPixels * 2,
       denoiser_.state.device_pointer,
-      denoiser_.scratch_offset,
-      denoiser_.state.device_pointer + denoiser_.scratch_offset,
-      denoiser_.scratch_size);
+      denoiser_.sizes.stateSizeInBytes,
+      denoiser_.state.device_pointer + denoiser_.sizes.stateSizeInBytes,
+      denoiser_.sizes.withOverlapScratchSizeInBytes);
   if (result != OPTIX_SUCCESS) {
     set_error("Failed to set up OptiX denoiser");
     return false;
@@ -907,8 +950,7 @@ bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
   cuda_assert(cuCtxSynchronize());
 
   denoiser_.is_configured = true;
-  denoiser_.configured_size.x = buffer_params.width;
-  denoiser_.configured_size.y = buffer_params.height;
+  denoiser_.configured_size = tile_size;
 
   return true;
 }
@@ -979,18 +1021,20 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
   guide_layers.albedo = albedo_layer;
   guide_layers.normal = normal_layer;
 
-  optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
-                                   denoiser_.queue.stream(),
-                                   &params,
-                                   denoiser_.state.device_pointer,
-                                   denoiser_.scratch_offset,
-                                   &guide_layers,
-                                   &image_layers,
-                                   1,
-                                   0,
-                                   0,
-                                   denoiser_.state.device_pointer + denoiser_.scratch_offset,
-                                   denoiser_.scratch_size));
+  optix_assert(optixUtilDenoiserInvokeTiled(denoiser_.optix_denoiser,
+                                            denoiser_.queue.stream(),
+                                            &params,
+                                            denoiser_.state.device_pointer,
+                                            denoiser_.sizes.stateSizeInBytes,
+                                            &guide_layers,
+                                            &image_layers,
+                                            1,
+                                            denoiser_.state.device_pointer +
+                                                denoiser_.sizes.stateSizeInBytes,
+                                            denoiser_.sizes.withOverlapScratchSizeInBytes,
+                                            denoiser_.sizes.overlapWindowSizeInPixels,
+                                            denoiser_.configured_size.x,
+                                            denoiser_.configured_size.y));
 
   return true;
 }
@@ -1000,6 +1044,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
                                   const OptixBuildInput &build_input,
                                   uint16_t num_motion_steps)
 {
+  /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
+   * from running out of memory (since both original and compacted acceleration structure memory
+   * may be allocated at the same time for the duration of this function). The builds would
+   * otherwise happen on the same CUDA stream anyway. */
+  static thread_mutex mutex;
+  thread_scoped_lock lock(mutex);
+
   const CUDAContextScope scope(this);
 
   const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
@@ -1025,14 +1076,15 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
   optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
 
   /* Allocate required output buffers. */
-  device_only_memory<char> temp_mem(this, "optix temp as build mem");
+  device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
   temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
   if (!temp_mem.device_pointer) {
     /* Make sure temporary memory allocation succeeded. */
     return false;
   }
 
-  device_only_memory<char> &out_data = bvh->as_data;
+  /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
+  device_only_memory<char> &out_data = *bvh->as_data;
   if (operation == OPTIX_BUILD_OPERATION_BUILD) {
     assert(out_data.device == this);
     out_data.alloc_to_device(sizes.outputSizeInBytes);
@@ -1080,12 +1132,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
 
     /* There is no point compacting if the size does not change. */
     if (compacted_size < sizes.outputSizeInBytes) {
-      device_only_memory<char> compacted_data(this, "optix compacted as");
+      device_only_memory<char> compacted_data(this, "optix compacted as", false);
       compacted_data.alloc_to_device(compacted_size);
-      if (!compacted_data.device_pointer)
+      if (!compacted_data.device_pointer) {
         /* Do not compact if memory allocation for compacted acceleration structure fails.
          * Can just use the uncompacted one then, so succeed here regardless. */
         return !have_error();
+      }
 
       optix_assert(optixAccelCompact(
           context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
@@ -1096,6 +1149,8 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
 
       std::swap(out_data.device_size, compacted_data.device_size);
       std::swap(out_data.device_pointer, compacted_data.device_pointer);
+      /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
+       */
     }
   }
 
@@ -1123,7 +1178,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       operation = OPTIX_BUILD_OPERATION_UPDATE;
     }
     else {
-      bvh_optix->as_data.free();
+      bvh_optix->as_data->free();
       bvh_optix->traversable_handle = 0;
     }
 
@@ -1178,20 +1233,27 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
               int ka = max(k0 - 1, curve.first_key);
               int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
 
+              index_data[i] = i * 4;
+              float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+
+#  if OPTIX_ABI_VERSION >= 55
+              v[0] = make_float4(keys[ka].x, keys[ka].y, keys[ka].z, curve_radius[ka]);
+              v[1] = make_float4(keys[k0].x, keys[k0].y, keys[k0].z, curve_radius[k0]);
+              v[2] = make_float4(keys[k1].x, keys[k1].y, keys[k1].z, curve_radius[k1]);
+              v[3] = make_float4(keys[kb].x, keys[kb].y, keys[kb].z, curve_radius[kb]);
+#  else
               const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
               const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
               const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
               const float4 pw = make_float4(
                   curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
 
-              /* Convert Catmull-Rom data to Bezier spline. */
+              /* Convert Catmull-Rom data to B-spline. */
               static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
               static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
               static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
               static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
 
-              index_data[i] = i * 4;
-              float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
               v[0] = make_float4(
                   dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
               v[1] = make_float4(
@@ -1200,6 +1262,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
                   dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
               v[3] = make_float4(
                   dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+#  endif
             }
             else {
               BoundBox bounds = BoundBox::empty;
@@ -1241,7 +1304,11 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       OptixBuildInput build_input = {};
       if (hair->curve_shape == CURVE_THICK) {
         build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+#  if OPTIX_ABI_VERSION >= 55
+        build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
+#  else
         build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+#  endif
         build_input.curveArray.numPrimitives = num_segments;
         build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
         build_input.curveArray.numVertices = num_vertices;
@@ -1255,7 +1322,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       }
       else {
         /* Disable visibility test any-hit program, since it is already checked during
-         * intersection. Those trace calls that require anyhit can force it with a ray flag. */
+         * intersection. Those trace calls that require any-hit can force it with a ray flag. */
         build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
 
         build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
@@ -1339,14 +1406,94 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         progress.set_error("Failed to build OptiX acceleration structure");
       }
     }
+    else if (geom->geometry_type == Geometry::POINTCLOUD) {
+      /* Build BLAS for point primitives. */
+      PointCloud *const pointcloud = static_cast<PointCloud *const>(geom);
+      const size_t num_points = pointcloud->num_points();
+      if (num_points == 0) {
+        return;
+      }
+
+      size_t num_motion_steps = 1;
+      Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+      if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
+        num_motion_steps = pointcloud->get_motion_steps();
+      }
+
+      device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+      aabb_data.alloc(num_points * num_motion_steps);
+
+      /* Get AABBs for each motion step. */
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        /* The center step for motion vertices is not stored in the attribute. */
+        const float3 *points = pointcloud->get_points().data();
+        const float *radius = pointcloud->get_radius().data();
+        size_t center_step = (num_motion_steps - 1) / 2;
+        if (step != center_step) {
+          size_t attr_offset = (step > center_step) ? step - 1 : step;
+          /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+          points = motion_points->data_float3() + attr_offset * num_points;
+        }
+
+        for (size_t i = 0; i < num_points; ++i) {
+          const PointCloud::Point point = pointcloud->get_point(i);
+          BoundBox bounds = BoundBox::empty;
+          point.bounds_grow(points, radius, bounds);
+
+          const size_t index = step * num_points + i;
+          aabb_data[index].minX = bounds.min.x;
+          aabb_data[index].minY = bounds.min.y;
+          aabb_data[index].minZ = bounds.min.z;
+          aabb_data[index].maxX = bounds.max.x;
+          aabb_data[index].maxY = bounds.max.y;
+          aabb_data[index].maxZ = bounds.max.z;
+        }
+      }
+
+      /* Upload AABB data to GPU. */
+      aabb_data.copy_to_device();
+
+      vector<device_ptr> aabb_ptrs;
+      aabb_ptrs.reserve(num_motion_steps);
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        aabb_ptrs.push_back(aabb_data.device_pointer + step * num_points * sizeof(OptixAabb));
+      }
+
+      /* Disable visibility test any-hit program, since it is already checked during
+       * intersection. Those trace calls that require any-hit can force it with a ray flag.
+       * For those, force a single any-hit call, so shadow record-all behavior works correctly. */
+      unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT |
+                                 OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+      OptixBuildInput build_input = {};
+      build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+#  if OPTIX_ABI_VERSION < 23
+      build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+      build_input.aabbArray.numPrimitives = num_points;
+      build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+      build_input.aabbArray.flags = &build_flags;
+      build_input.aabbArray.numSbtRecords = 1;
+      build_input.aabbArray.primitiveIndexOffset = pointcloud->prim_offset;
+#  else
+      build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+      build_input.customPrimitiveArray.numPrimitives = num_points;
+      build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+      build_input.customPrimitiveArray.flags = &build_flags;
+      build_input.customPrimitiveArray.numSbtRecords = 1;
+      build_input.customPrimitiveArray.primitiveIndexOffset = pointcloud->prim_offset;
+#  endif
+
+      if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+        progress.set_error("Failed to build OptiX acceleration structure");
+      }
+    }
   }
   else {
     unsigned int num_instances = 0;
     unsigned int max_num_instances = 0xFFFFFFFF;
 
-    bvh_optix->as_data.free();
+    bvh_optix->as_data->free();
     bvh_optix->traversable_handle = 0;
-    bvh_optix->motion_transform_data.free();
+    bvh_optix->motion_transform_data->free();
 
     optixDeviceContextGetProperty(context,
                                   OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
@@ -1379,8 +1526,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         }
       }
 
-      assert(bvh_optix->motion_transform_data.device == this);
-      bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+      assert(bvh_optix->motion_transform_data->device == this);
+      bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
     }
 
     for (Object *ob : bvh->objects) {
@@ -1422,9 +1569,22 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
           instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
         }
       }
-      else {
+      else if (ob->get_geometry()->geometry_type == Geometry::POINTCLOUD) {
+        /* Use the hit group that has an intersection program for point clouds. */
+        instance.sbtOffset = PG_HITD_POINTCLOUD - PG_HITD;
+
+        /* Also skip point clouds in local trace calls. */
+        instance.visibilityMask |= 4;
+      }
+
+#  if OPTIX_ABI_VERSION < 55
+      /* Cannot disable any-hit program for thick curves, since it needs to filter out end-caps. */
+      else
+#  endif
+      {
         /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
          * since it needs to filter out end-caps there).
+         *
          * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
          * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
          */
@@ -1441,7 +1601,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
         motion_transform_offset = align_up(motion_transform_offset,
                                            OPTIX_TRANSFORM_BYTE_ALIGNMENT);
-        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+        CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
                                            motion_transform_offset;
         motion_transform_offset += motion_transform_size;
 
@@ -1494,9 +1654,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
         delete[] reinterpret_cast<uint8_t *>(&motion_transform);
 
-        /* Disable instance transform if object uses motion transform already. */
-        instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
         /* Get traversable handle to motion transform. */
         optixConvertPointerToTraversableHandle(context,
                                                motion_transform_gpu,
@@ -1510,10 +1667,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
           /* Set transform matrix. */
           memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
         }
-        else {
-          /* Disable instance transform if geometry already has it applied to vertex data. */
-          instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-        }
       }
     }