diff options
author | Clément Foucault <foucault.clem@gmail.com> | 2020-07-15 15:18:30 +0300 |
---|---|---|
committer | Clément Foucault <foucault.clem@gmail.com> | 2020-07-15 15:23:35 +0300 |
commit | e8f8c13d4b76ba587ef7cf33370b286d4fbd36bc (patch) | |
tree | 371472ae220ad8740b310aaa8f4c5746448302c5 /intern/cycles/device | |
parent | 0c062a9e082130212447c2b67e8e16b8a2e622d1 (diff) | |
parent | 44bb73e765a6f79bc14a46449368f83e572d8bad (diff) |
PointCloud: Initial rendering support for Workbench (branch: tmp-pointcloud-render)
Also includes outline overlays. Removes the temp overlay drawing.
We make the geometry follow the camera like billboards; this uses less
geometry. Currently we use a half octahedron. The goal would be
to use icospheres.
This patch also optimizes the case when the pointcloud has a uniform radius.
However, we should premultiply the radius prop by the default radius
beforehand to avoid a multiplication on the CPU.
Differential Revision: https://developer.blender.org/D8301
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/device.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 235 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 52 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 8 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 47 | ||||
-rw-r--r-- | intern/cycles/device/device_optix.cpp | 395 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 21 | ||||
-rw-r--r-- | intern/cycles/device/opencl/device_opencl_impl.cpp | 2 |
10 files changed, 574 insertions, 196 deletions
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index b9bbeb9a25b..0be2c322dfa 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -1760,7 +1760,7 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.render_buffer.samples = rtile.sample; denoising.buffer.gpu_temporary_mem = true; - denoising.run_denoising(&rtile); + denoising.run_denoising(rtile); } void CUDADevice::adaptive_sampling_filter(uint filter_sample, diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 9dbb33980b4..407f73e8451 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -209,13 +209,13 @@ bool Device::bind_fallback_display_space_shader(const float width, const float h glUseProgram(fallback_shader_program); image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); if (image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform."; + LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; return false; } fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); if (fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform."; + LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; return false; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index a5833369a17..115b05e3911 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -439,10 +439,10 @@ class Device { { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) { } - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + virtual void unmap_neighbor_tiles(Device 
* /*sub_device*/, RenderTileNeighbors & /*neighbors*/) { } diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 8f68e66a1b4..ee3a3ddea64 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -182,6 +182,7 @@ class CPUDevice : public Device { oidn::DeviceRef oidn_device; oidn::FilterRef oidn_filter; #endif + thread_spin_lock oidn_task_lock; bool use_split_kernel; @@ -948,12 +949,25 @@ class CPUDevice : public Device { } } - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + void denoise_openimagedenoise_buffer(DeviceTask &task, + float *buffer, + const size_t offset, + const size_t stride, + const size_t x, + const size_t y, + const size_t w, + const size_t h, + const float scale) { #ifdef WITH_OPENIMAGEDENOISE assert(openimagedenoise_supported()); - /* Only one at a time, since OpenImageDenoise itself is multithreaded. */ + /* Only one at a time, since OpenImageDenoise itself is multithreaded for full + * buffers, and for tiled rendering because creating multiple devices and filters + * is slow and memory hungry as well. + * + * TODO: optimize tiled rendering case, by batching together denoising of many + * tiles somehow? */ static thread_mutex mutex; thread_scoped_lock lock(mutex); @@ -964,54 +978,192 @@ class CPUDevice : public Device { } if (!oidn_filter) { oidn_filter = oidn_device.newFilter("RT"); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); } - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - /* Set images with appropriate stride for our interleaved pass storage. 
*/ - const struct { + struct { const char *name; - int offset; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR}, - {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL}, - {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO}, - {"output", 0}, + const int offset; + const bool scale; + const bool use; + array<float> scaled_buffer; + } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, + {"albedo", + task.pass_denoising_data + DENOISING_PASS_ALBEDO, + true, + task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, + {"normal", + task.pass_denoising_data + DENOISING_PASS_NORMAL, + true, + task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, + {"output", 0, false, true}, { NULL, 0 }}; for (int i = 0; passes[i].name; i++) { - const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride; - const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float); - const int64_t pixel_stride = task.pass_stride * sizeof(float); - const int64_t row_stride = rtile.stride * pixel_stride; + if (!passes[i].use) { + continue; + } - oidn_filter.setImage(passes[i].name, - (char *)rtile.buffer + buffer_offset, - oidn::Format::Float3, - rtile.w, - rtile.h, - 0, - pixel_stride, - row_stride); + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); + const int64_t pixel_stride = task.pass_stride; + const int64_t row_stride = stride * pixel_stride; + + if (passes[i].scale && scale != 1.0f) { + /* Normalize albedo and normal passes as they are scaled by the number of samples. + * For the color passes OIDN will perform auto-exposure making it unnecessary. 
*/ + array<float> &scaled_buffer = passes[i].scaled_buffer; + scaled_buffer.resize(w * h * 3); + + for (int y = 0; y < h; y++) { + const float *pass_row = buffer + buffer_offset + y * row_stride; + float *scaled_row = scaled_buffer.data() + y * w * 3; + + for (int x = 0; x < w; x++) { + scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; + scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; + scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; + } + } + + oidn_filter.setImage( + passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); + } + else { + oidn_filter.setImage(passes[i].name, + buffer + buffer_offset, + oidn::Format::Float3, + w, + h, + 0, + pixel_stride * sizeof(float), + row_stride * sizeof(float)); + } } /* Execute filter. */ - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); oidn_filter.commit(); oidn_filter.execute(); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); #else (void)task; - (void)rtile; + (void)buffer; + (void)offset; + (void)stride; + (void)x; + (void)y; + (void)w; + (void)h; + (void)scale; #endif } + void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + { + if (task.type == DeviceTask::DENOISE_BUFFER) { + /* Copy pixels from compute device to CPU (no-op for CPU device). */ + rtile.buffers->buffer.copy_from_device(); + + denoise_openimagedenoise_buffer(task, + (float *)rtile.buffer, + rtile.offset, + rtile.stride, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + 1.0f / rtile.sample); + + /* todo: it may be possible to avoid this copy, but we have to ensure that + * when other code copies data from the device it doesn't overwrite the + * denoiser buffers. */ + rtile.buffers->buffer.copy_to_device(); + } + else { + /* Per-tile denoising. 
*/ + rtile.sample = rtile.start_sample + rtile.num_samples; + const float scale = 1.0f / rtile.sample; + const float invscale = rtile.sample; + const size_t pass_stride = task.pass_stride; + + /* Map neighboring tiles into one buffer for denoising. */ + RenderTileNeighbors neighbors(rtile); + task.map_neighbor_tiles(neighbors, this); + RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; + rtile = center_tile; + + /* Calculate size of the tile to denoise (including overlap). The overlap + * size was chosen empirically. OpenImageDenoise specifies an overlap size + * of 128 but this is significantly bigger than typical tile size. */ + const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); + const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); + + /* Adjacent tiles are in separate memory regions, copy into single buffer. */ + array<float> merged(rect_size.x * rect_size.y * task.pass_stride); + + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &ntile = neighbors.tiles[i]; + if (!ntile.buffer) { + continue; + } + + const int xmin = max(ntile.x, rect.x); + const int ymin = max(ntile.y, rect.y); + const int xmax = min(ntile.x + ntile.w, rect.z); + const int ymax = min(ntile.y + ntile.h, rect.w); + + const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; + const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; + + const size_t merged_stride = rect_size.x; + const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; + float *merged_buffer = merged.data() + merged_offset * pass_stride; + + for (int y = ymin; y < ymax; y++) { + for (int x = 0; x < pass_stride * (xmax - xmin); x++) { + merged_buffer[x] = tile_buffer[x] * scale; + } + tile_buffer += ntile.stride * pass_stride; + merged_buffer += merged_stride * pass_stride; + } + } + + /* Denoise */ + denoise_openimagedenoise_buffer( + task, merged.data(), 0, rect_size.x, 0, 0, 
rect_size.x, rect_size.y, 1.0f); + + /* Copy back result from merged buffer. */ + RenderTile &ntile = neighbors.target; + if (ntile.buffer) { + const int xmin = max(ntile.x, rect.x); + const int ymin = max(ntile.y, rect.y); + const int xmax = min(ntile.x + ntile.w, rect.z); + const int ymax = min(ntile.y + ntile.h, rect.w); + + const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; + float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; + + const size_t merged_stride = rect_size.x; + const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; + const float *merged_buffer = merged.data() + merged_offset * pass_stride; + + for (int y = ymin; y < ymax; y++) { + for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { + tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; + tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; + tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; + } + tile_buffer += ntile.stride * pass_stride; + merged_buffer += merged_stride * pass_stride; + } + } + + task.unmap_neighbor_tiles(neighbors, this); + } + } + void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) { ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); @@ -1040,7 +1192,7 @@ class CPUDevice : public Device { denoising.render_buffer.samples = tile.sample; denoising.buffer.gpu_temporary_mem = false; - denoising.run_denoising(&tile); + denoising.run_denoising(tile); } void thread_render(DeviceTask &task) @@ -1070,10 +1222,23 @@ class CPUDevice : public Device { } } + /* NLM denoiser. */ DenoisingTask *denoising = NULL; + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. 
*/ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { + while (task.acquire_tile(this, tile, tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel) { device_only_memory<uchar> void_buffer(this, "void_buffer"); @@ -1108,6 +1273,10 @@ class CPUDevice : public Device { } } + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + profiler.remove_state(&kg->profiler); thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 89de80a5bcd..38c42d15cab 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -71,29 +71,30 @@ DenoisingTask::~DenoisingTask() tile_info_mem.free(); } -void DenoisingTask::set_render_buffer(RenderTile *rtiles) +void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors) { - for (int i = 0; i < 9; i++) { - tile_info->offsets[i] = rtiles[i].offset; - tile_info->strides[i] = rtiles[i].stride; - tile_info->buffers[i] = rtiles[i].buffer; + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &rtile = neighbors.tiles[i]; + tile_info->offsets[i] = rtile.offset; + tile_info->strides[i] = rtile.stride; + tile_info->buffers[i] = rtile.buffer; } - tile_info->x[0] = rtiles[3].x; - tile_info->x[1] = rtiles[4].x; - tile_info->x[2] = rtiles[5].x; - tile_info->x[3] = rtiles[5].x + rtiles[5].w; - tile_info->y[0] = rtiles[1].y; - tile_info->y[1] = rtiles[4].y; - tile_info->y[2] = rtiles[7].y; - tile_info->y[3] = rtiles[7].y + rtiles[7].h; - - target_buffer.offset = rtiles[9].offset; - target_buffer.stride = rtiles[9].stride; - target_buffer.ptr = 
rtiles[9].buffer; - - if (do_prefilter && rtiles[9].buffers) { + tile_info->x[0] = neighbors.tiles[3].x; + tile_info->x[1] = neighbors.tiles[4].x; + tile_info->x[2] = neighbors.tiles[5].x; + tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; + tile_info->y[0] = neighbors.tiles[1].y; + tile_info->y[1] = neighbors.tiles[4].y; + tile_info->y[2] = neighbors.tiles[7].y; + tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; + + target_buffer.offset = neighbors.target.offset; + target_buffer.stride = neighbors.target.stride; + target_buffer.ptr = neighbors.target.buffer; + + if (do_prefilter && neighbors.target.buffers) { target_buffer.denoising_output_offset = - rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + neighbors.target.buffers->params.get_denoising_prefiltered_offset(); } else { target_buffer.denoising_output_offset = 0; @@ -320,12 +321,11 @@ void DenoisingTask::reconstruct() functions.solve(target_buffer.ptr); } -void DenoisingTask::run_denoising(RenderTile *tile) +void DenoisingTask::run_denoising(RenderTile &tile) { - RenderTile rtiles[10]; - rtiles[4] = *tile; - functions.map_neighbor_tiles(rtiles); - set_render_buffer(rtiles); + RenderTileNeighbors neighbors(tile); + functions.map_neighbor_tiles(neighbors); + set_render_buffer(neighbors); setup_denoising_buffer(); @@ -347,7 +347,7 @@ void DenoisingTask::run_denoising(RenderTile *tile) write_buffer(); } - functions.unmap_neighbor_tiles(rtiles); + functions.unmap_neighbor_tiles(neighbors); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 4c122e981eb..2c0dc23b44a 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -102,8 +102,8 @@ class DenoisingTask { device_ptr output_ptr)> detect_outliers; function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; - function<void(RenderTile *rtiles)> map_neighbor_tiles; - 
function<void(RenderTile *rtiles)> unmap_neighbor_tiles; + function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles; + function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles; } functions; /* Stores state of the current Reconstruction operation, @@ -154,7 +154,7 @@ class DenoisingTask { DenoisingTask(Device *device, const DeviceTask &task); ~DenoisingTask(); - void run_denoising(RenderTile *tile); + void run_denoising(RenderTile &tile); struct DenoiseBuffers { int pass_stride; @@ -179,7 +179,7 @@ class DenoisingTask { protected: Device *device; - void set_render_buffer(RenderTile *rtiles); + void set_render_buffer(RenderTileNeighbors &neighbors); void setup_denoising_buffer(); void prefilter_shadowing(); void prefilter_features(); diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index fd14bbdccc5..9ea8782d0f0 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -177,8 +177,11 @@ class MultiDevice : public Device { return false; if (requested_features.use_denoising) { + /* Only need denoising feature, everything else is unused. 
*/ + DeviceRequestedFeatures denoising_features; + denoising_features.use_denoising = true; foreach (SubDevice &sub, denoising_devices) - if (!sub.device->load_kernels(requested_features)) + if (!sub.device->load_kernels(denoising_features)) return false; } @@ -581,20 +584,22 @@ class MultiDevice : public Device { return -1; } - void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) { - for (int i = 0; i < 9; i++) { - if (!tiles[i].buffers) { + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &tile = neighbors.tiles[i]; + + if (!tile.buffers) { continue; } - device_vector<float> &mem = tiles[i].buffers->buffer; - tiles[i].buffer = mem.device_pointer; + device_vector<float> &mem = tile.buffers->buffer; + tile.buffer = mem.device_pointer; if (mem.device == this && matching_rendering_and_denoising_devices) { /* Skip unnecessary copies in viewport mode (buffer covers the * whole image), but still need to fix up the tile device pointer. */ - map_tile(sub_device, tiles[i]); + map_tile(sub_device, tile); continue; } @@ -607,15 +612,15 @@ class MultiDevice : public Device { * also required for the case where a CPU thread is denoising * a tile rendered on the GPU. In that case we have to avoid * overwriting the buffer being de-noised by the CPU thread. */ - if (!tiles[i].buffers->map_neighbor_copied) { - tiles[i].buffers->map_neighbor_copied = true; + if (!tile.buffers->map_neighbor_copied) { + tile.buffers->map_neighbor_copied = true; mem.copy_from_device(); } if (mem.device == this) { /* Can re-use memory if tile is already allocated on the sub device. 
*/ - map_tile(sub_device, tiles[i]); - mem.swap_device(sub_device, mem.device_size, tiles[i].buffer); + map_tile(sub_device, tile); + mem.swap_device(sub_device, mem.device_size, tile.buffer); } else { mem.swap_device(sub_device, 0, 0); @@ -623,40 +628,42 @@ class MultiDevice : public Device { mem.copy_to_device(); - tiles[i].buffer = mem.device_pointer; - tiles[i].device_size = mem.device_size; + tile.buffer = mem.device_pointer; + tile.device_size = mem.device_size; mem.restore_device(); } } } - void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles) + void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) { - device_vector<float> &mem = tiles[9].buffers->buffer; + RenderTile &target_tile = neighbors.target; + device_vector<float> &mem = target_tile.buffers->buffer; if (mem.device == this && matching_rendering_and_denoising_devices) { return; } /* Copy denoised result back to the host. */ - mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); + mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer); mem.copy_from_device(); mem.restore_device(); /* Copy denoised result to the original device. */ mem.copy_to_device(); - for (int i = 0; i < 9; i++) { - if (!tiles[i].buffers) { + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &tile = neighbors.tiles[i]; + if (!tile.buffers) { continue; } - device_vector<float> &mem = tiles[i].buffers->buffer; + device_vector<float> &mem = tile.buffers->buffer; if (mem.device != sub_device && mem.device != this) { /* Free up memory again if it was allocated for the copy above. 
*/ - mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); + mem.swap_device(sub_device, tile.device_size, tile.buffer); sub_device->mem_free(mem); mem.restore_device(); } diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index ececca3df53..1cc45983565 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -131,8 +131,12 @@ class OptiXDevice : public CUDADevice { PG_RGEN, PG_MISS, PG_HITD, // Default hit group - PG_HITL, // __BVH_LOCAL__ hit group PG_HITS, // __SHADOW_RECORD_ALL__ hit group + PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) +# if OPTIX_ABI_VERSION >= 36 + PG_HITD_MOTION, + PG_HITS_MOTION, +# endif # ifdef WITH_CYCLES_DEBUG PG_EXCP, # endif @@ -177,6 +181,7 @@ class OptiXDevice : public CUDADevice { OptixDeviceContext context = NULL; OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module + OptixModule builtin_modules[2] = {}; OptixPipeline pipelines[NUM_PIPELINES] = {}; bool motion_blur = false; @@ -264,6 +269,9 @@ class OptiXDevice : public CUDADevice { // Unload modules if (optix_module != NULL) optixModuleDestroy(optix_module); + for (unsigned int i = 0; i < 2; ++i) + if (builtin_modules[i] != NULL) + optixModuleDestroy(builtin_modules[i]); for (unsigned int i = 0; i < NUM_PIPELINES; ++i) if (pipelines[i] != NULL) optixPipelineDestroy(pipelines[i]); @@ -338,6 +346,12 @@ class OptiXDevice : public CUDADevice { optixModuleDestroy(optix_module); optix_module = NULL; } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + builtin_modules[i] = NULL; + } + } for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { if (pipelines[i] != NULL) { optixPipelineDestroy(pipelines[i]); @@ -369,6 +383,18 @@ class OptiXDevice : public CUDADevice { # endif pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h +# if OPTIX_ABI_VERSION >= 
36 + pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; + if (requested_features.use_hair) { + if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; + } + else { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } + } +# endif + // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds // This is necessary since objects may be reported to have motion if the Vector pass is // active, but may still need to be rendered without motion blur if that isn't active as well @@ -442,6 +468,34 @@ class OptiXDevice : public CUDADevice { group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; } + +# if OPTIX_ABI_VERSION >= 36 + if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { + OptixBuiltinISOptions builtin_options; + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + builtin_options.usesMotionBlur = false; + + check_result_optix_ret(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); + + group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; + group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; + + if (motion_blur) { + builtin_options.usesMotionBlur = true; + + check_result_optix_ret(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); + + group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; + group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; + group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; + 
group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; + } + } +# endif } if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { @@ -493,8 +547,14 @@ class OptiXDevice : public CUDADevice { unsigned int trace_css = stack_size[PG_HITD].cssCH; // This is based on the maximum of closest-hit and any-hit/intersection programs trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); +# if OPTIX_ABI_VERSION >= 36 + trace_css = std::max(trace_css, + stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); +# endif OptixPipelineLinkOptions link_options; link_options.maxTraceDepth = 1; @@ -503,17 +563,23 @@ class OptiXDevice : public CUDADevice { # else link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; # endif - link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur; +# if OPTIX_ABI_VERSION < 24 + link_options.overrideUsesMotionBlur = motion_blur; +# endif { // Create path tracing pipeline OptixProgramGroup pipeline_groups[] = { - groups[PG_RGEN], - groups[PG_MISS], - groups[PG_HITD], - groups[PG_HITS], - groups[PG_HITL], + groups[PG_RGEN], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# if OPTIX_ABI_VERSION >= 36 + groups[PG_HITD_MOTION], + groups[PG_HITS_MOTION], +# endif # ifdef WITH_CYCLES_DEBUG - groups[PG_EXCP], + groups[PG_EXCP], # endif }; check_result_optix_ret( @@ -530,8 +596,8 @@ class OptiXDevice : public CUDADevice { const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; // Set stack size depending on pipeline options 
- check_result_optix_ret(optixPipelineSetStackSize( - pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2))); + check_result_optix_ret( + optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2))); } // Only need to create shader evaluation pipeline if one of these features is used: @@ -541,15 +607,19 @@ class OptiXDevice : public CUDADevice { if (use_shader_eval_pipeline) { // Create shader evaluation pipeline OptixProgramGroup pipeline_groups[] = { - groups[PG_BAKE], - groups[PG_DISP], - groups[PG_BACK], - groups[PG_MISS], - groups[PG_HITD], - groups[PG_HITS], - groups[PG_HITL], + groups[PG_BAKE], + groups[PG_DISP], + groups[PG_BACK], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# if OPTIX_ABI_VERSION >= 36 + groups[PG_HITD_MOTION], + groups[PG_HITS_MOTION], +# endif # ifdef WITH_CYCLES_DEBUG - groups[PG_EXCP], + groups[PG_EXCP], # endif }; check_result_optix_ret( @@ -672,7 +742,11 @@ class OptiXDevice : public CUDADevice { sbt_params.missRecordCount = 1; sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS +# if OPTIX_ABI_VERSION >= 36 + sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL +# else + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL +# endif // Launch the ray generation program check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], @@ -727,19 +801,18 @@ class OptiXDevice : public CUDADevice { // 0 1 2 // 3 4 5 // 6 7 8 9 - RenderTile rtiles[10]; - rtiles[4] = rtile; - task.map_neighbor_tiles(rtiles, this); - rtile = rtiles[4]; // Tile may have been modified by mapping code + RenderTileNeighbors neighbors(rtile); + task.map_neighbor_tiles(neighbors, this); + RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; + RenderTile &target_tile = 
neighbors.target; + rtile = center_tile; // Tile may have been modified by mapping code // Calculate size of the tile to denoise (including overlap) - int4 rect = make_int4( - rtiles[4].x, rtiles[4].y, rtiles[4].x + rtiles[4].w, rtiles[4].y + rtiles[4].h); + int4 rect = center_tile.bounds(); // Overlap between tiles has to be at least 64 pixels // TODO(pmours): Query this value from OptiX rect = rect_expand(rect, 64); - int4 clip_rect = make_int4( - rtiles[3].x, rtiles[1].y, rtiles[5].x + rtiles[5].w, rtiles[7].y + rtiles[7].h); + int4 clip_rect = neighbors.bounds(); rect = rect_clip(rect, clip_rect); int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); @@ -760,14 +833,14 @@ class OptiXDevice : public CUDADevice { device_only_memory<float> input(this, "denoiser input"); device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE); - if ((!rtiles[0].buffer || rtiles[0].buffer == rtile.buffer) && - (!rtiles[1].buffer || rtiles[1].buffer == rtile.buffer) && - (!rtiles[2].buffer || rtiles[2].buffer == rtile.buffer) && - (!rtiles[3].buffer || rtiles[3].buffer == rtile.buffer) && - (!rtiles[5].buffer || rtiles[5].buffer == rtile.buffer) && - (!rtiles[6].buffer || rtiles[6].buffer == rtile.buffer) && - (!rtiles[7].buffer || rtiles[7].buffer == rtile.buffer) && - (!rtiles[8].buffer || rtiles[8].buffer == rtile.buffer)) { + bool contiguous_memory = true; + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { + contiguous_memory = false; + } + } + + if (contiguous_memory) { // Tiles are in continous memory, so can just subtract overlap offset input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; // Stride covers the whole width of the image and not just a single tile @@ -782,19 +855,19 @@ class OptiXDevice : public CUDADevice { input_stride *= rect_size.x; TileInfo 
*tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < 9; i++) { - tile_info->offsets[i] = rtiles[i].offset; - tile_info->strides[i] = rtiles[i].stride; - tile_info->buffers[i] = rtiles[i].buffer; + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + tile_info->offsets[i] = neighbors.tiles[i].offset; + tile_info->strides[i] = neighbors.tiles[i].stride; + tile_info->buffers[i] = neighbors.tiles[i].buffer; } - tile_info->x[0] = rtiles[3].x; - tile_info->x[1] = rtiles[4].x; - tile_info->x[2] = rtiles[5].x; - tile_info->x[3] = rtiles[5].x + rtiles[5].w; - tile_info->y[0] = rtiles[1].y; - tile_info->y[1] = rtiles[4].y; - tile_info->y[2] = rtiles[7].y; - tile_info->y[3] = rtiles[7].y + rtiles[7].h; + tile_info->x[0] = neighbors.tiles[3].x; + tile_info->x[1] = neighbors.tiles[4].x; + tile_info->x[2] = neighbors.tiles[5].x; + tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; + tile_info->y[0] = neighbors.tiles[1].y; + tile_info->y[1] = neighbors.tiles[4].y; + tile_info->y[2] = neighbors.tiles[7].y; + tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; tile_info_mem.copy_to_device(); void *args[] = { @@ -804,7 +877,7 @@ class OptiXDevice : public CUDADevice { # if OPTIX_DENOISER_NO_PIXEL_STRIDE device_only_memory<float> input_rgb(this, "denoiser input rgb"); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.optix_input_passes); + input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); void *input_args[] = {&input_rgb.device_pointer, &input_ptr, @@ -813,7 +886,7 @@ class OptiXDevice : public CUDADevice { &input_stride, &task.pass_stride, const_cast<int *>(pass_offset), - &task.denoising.optix_input_passes, + &task.denoising.input_passes, &rtile.sample}; launch_filter_kernel( "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); @@ -824,7 +897,7 @@ class OptiXDevice : public CUDADevice { # endif const bool recreate_denoiser = (denoiser == NULL) || - 
(task.denoising.optix_input_passes != denoiser_input_passes); + (task.denoising.input_passes != denoiser_input_passes); if (recreate_denoiser) { // Destroy existing handle before creating new one if (denoiser != NULL) { @@ -833,23 +906,29 @@ class OptiXDevice : public CUDADevice { // Create OptiX denoiser handle on demand when it is first used OptixDenoiserOptions denoiser_options; - assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3); + assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1)); + OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); +# if OPTIX_ABI_VERSION < 28 denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; +# endif check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); check_result_optix_ret( optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.optix_input_passes; + denoiser_input_passes = task.denoising.input_passes; } OptixDenoiserSizes sizes = {}; check_result_optix_ret( optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); +# if OPTIX_ABI_VERSION < 28 const size_t scratch_size = sizes.recommendedScratchSizeInBytes; +# else + const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; +# endif const size_t scratch_offset = sizes.stateSizeInBytes; // Allocate denoiser state if tile size has changed since last setup @@ -897,10 +976,10 @@ class OptiXDevice : public CUDADevice { int2 output_offset = overlap_offset; overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually # else - output_layers[0].data = rtiles[9].buffer + pixel_offset; - output_layers[0].width = rtiles[9].w; - output_layers[0].height = 
rtiles[9].h; - output_layers[0].rowStrideInBytes = rtiles[9].stride * pixel_stride; + output_layers[0].data = target_tile.buffer + pixel_offset; + output_layers[0].width = target_tile.w; + output_layers[0].height = target_tile.h; + output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; output_layers[0].pixelStrideInBytes = pixel_stride; # endif output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; @@ -913,7 +992,7 @@ class OptiXDevice : public CUDADevice { denoiser_state.device_pointer, scratch_offset, input_layers, - task.denoising.optix_input_passes, + task.denoising.input_passes, overlap_offset.x, overlap_offset.y, output_layers, @@ -922,26 +1001,26 @@ class OptiXDevice : public CUDADevice { # if OPTIX_DENOISER_NO_PIXEL_STRIDE void *output_args[] = {&input_ptr, - &rtiles[9].buffer, + &target_tile.buffer, &output_offset.x, &output_offset.y, &rect_size.x, &rect_size.y, - &rtiles[9].x, - &rtiles[9].y, - &rtiles[9].w, - &rtiles[9].h, - &rtiles[9].offset, - &rtiles[9].stride, + &target_tile.x, + &target_tile.y, + &target_tile.w, + &target_tile.h, + &target_tile.offset, + &target_tile.stride, &task.pass_stride, &rtile.sample}; launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", rtiles[9].w, rtiles[9].h, output_args); + "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); # endif check_result_cuda_ret(cuStreamSynchronize(0)); - task.unmap_neighbor_tiles(rtiles, this); + task.unmap_neighbor_tiles(neighbors, this); } else { // Run CUDA denoising kernels @@ -993,7 +1072,11 @@ class OptiXDevice : public CUDADevice { sbt_params.missRecordCount = 1; sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS +# if OPTIX_ABI_VERSION >= 36 + sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL +# else + sbt_params.hitgroupRecordCount = 3; // 
PG_HITD, PG_HITS, PG_HITL +# endif check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], cuda_stream[thread_index], @@ -1070,7 +1153,7 @@ class OptiXDevice : public CUDADevice { &build_input, 1, temp_mem.device_pointer, - temp_mem.device_size, + sizes.tempSizeInBytes, out_data, sizes.outputSizeInBytes, &out_handle, @@ -1142,7 +1225,6 @@ class OptiXDevice : public CUDADevice { continue; } - const size_t num_curves = hair->num_curves(); const size_t num_segments = hair->num_segments(); size_t num_motion_steps = 1; @@ -1152,7 +1234,18 @@ class OptiXDevice : public CUDADevice { } device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY); - aabb_data.alloc(num_segments * num_motion_steps); +# if OPTIX_ABI_VERSION >= 36 + device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY); + device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY); + // Four control points for each curve segment + const size_t num_vertices = num_segments * 4; + if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { + index_data.alloc(num_segments); + vertex_data.alloc(num_vertices * num_motion_steps); + } + else +# endif + aabb_data.alloc(num_segments * num_motion_steps); // Get AABBs for each motion step for (size_t step = 0; step < num_motion_steps; ++step) { @@ -1165,44 +1258,127 @@ class OptiXDevice : public CUDADevice { keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size(); } - size_t i = step * num_segments; - for (size_t j = 0; j < num_curves; ++j) { - const Hair::Curve c = hair->get_curve(j); - - for (size_t k = 0; k < c.num_segments(); ++i, ++k) { - BoundBox bounds = BoundBox::empty; - c.bounds_grow(k, keys, hair->curve_radius.data(), bounds); - - aabb_data[i].minX = bounds.min.x; - aabb_data[i].minY = bounds.min.y; - aabb_data[i].minZ = bounds.min.z; - aabb_data[i].maxX = bounds.max.x; - aabb_data[i].maxY = bounds.max.y; - aabb_data[i].maxZ = bounds.max.z; + for (size_t j = 0, i = 0; j < 
hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { +# if OPTIX_ABI_VERSION >= 36 + if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { + int k0 = curve.first_key + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, curve.first_key); + int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + + const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); + const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); + const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); + const float4 pw = make_float4(hair->curve_radius[ka], + hair->curve_radius[k0], + hair->curve_radius[k1], + hair->curve_radius[kb]); + + // Convert Catmull-Rom data to Bezier spline + static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; + static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; + static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; + static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; + + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + v[0] = make_float4( + dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); + v[1] = make_float4( + dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); + v[2] = make_float4( + dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); + v[3] = make_float4( + dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); + } + else +# endif + { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + 
aabb_data[index].maxZ = bounds.max.z; + } } } } // Upload AABB data to GPU aabb_data.copy_to_device(); +# if OPTIX_ABI_VERSION >= 36 + index_data.copy_to_device(); + vertex_data.copy_to_device(); +# endif vector<device_ptr> aabb_ptrs; aabb_ptrs.reserve(num_motion_steps); +# if OPTIX_ABI_VERSION >= 36 + vector<device_ptr> width_ptrs; + vector<device_ptr> vertex_ptrs; + width_ptrs.reserve(num_motion_steps); + vertex_ptrs.reserve(num_motion_steps); +# endif for (size_t step = 0; step < num_motion_steps; ++step) { aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); +# if OPTIX_ABI_VERSION >= 36 + const device_ptr base_ptr = vertex_data.device_pointer + + step * num_vertices * sizeof(float4); + width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size + vertex_ptrs.push_back(base_ptr); +# endif } - // Disable visibility test anyhit program, since it is already checked during intersection - // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + // Force a single any-hit call, so shadow record-all behavior works correctly + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; +# if OPTIX_ABI_VERSION >= 36 + if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { + build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + build_input.curveArray.numPrimitives = num_segments; + 
build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.curveArray.numVertices = num_vertices; + build_input.curveArray.vertexStrideInBytes = sizeof(float4); + build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); + build_input.curveArray.widthStrideInBytes = sizeof(float4); + build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; + build_input.curveArray.indexStrideInBytes = sizeof(int); + build_input.curveArray.flag = build_flags; + build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + else +# endif + { + // Disable visibility test any-hit program, since it is already checked during + // intersection. Those trace calls that require anyhit can force it with a ray flag. + build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; +# if OPTIX_ABI_VERSION < 23 + build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.aabbArray.numPrimitives = num_segments; + build_input.aabbArray.strideInBytes = sizeof(OptixAabb); + build_input.aabbArray.flags = &build_flags; + build_input.aabbArray.numSbtRecords = 1; + build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; +# else + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_segments; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; +# endif + } // Allocate memory for new BLAS and build it OptixTraversableHandle handle; @@ -1257,8 +1433,8 @@ class OptiXDevice : public CUDADevice { vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); } - // No special build flags for triangle primitives - unsigned int 
build_flags = OPTIX_GEOMETRY_FLAG_NONE; + // Force a single any-hit call, so shadow record-all behavior works correctly + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; OptixBuildInput build_input = {}; build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); @@ -1324,9 +1500,26 @@ class OptiXDevice : public CUDADevice { // Set user instance ID to object index instance.instanceId = ob->get_device_index(); - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - // See 'scene_intersect_volume' in bvh.h - instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1); + // Have to have at least one bit in the mask, or else instance would always be culled + instance.visibilityMask = 1; + + if (ob->geometry->has_volume) { + // Volumes have a special bit set in the visibility mask so a trace can mask only volumes + instance.visibilityMask |= 2; + } + + if (ob->geometry->type == Geometry::HAIR) { + // Same applies to curves (so they can be skipped in local trace calls) + instance.visibilityMask |= 4; + +# if OPTIX_ABI_VERSION >= 36 + if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api && + static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) { + // Select between motion blur and non-motion blur built-in intersection module + instance.sbtOffset = PG_HITD_MOTION - PG_HITD; + } +# endif + } // Insert motion traversable if object has motion if (motion_blur && ob->use_motion()) { diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 600973b8100..fd380788282 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -29,6 +29,7 @@ CCL_NAMESPACE_BEGIN class Device; class RenderBuffers; class RenderTile; +class RenderTileNeighbors; class Tile; enum DenoiserType { @@ -41,6 +42,14 @@ enum DenoiserType { DENOISER_ALL = ~0, }; +enum 
DenoiserInput { + DENOISER_INPUT_RGB = 1, + DENOISER_INPUT_RGB_ALBEDO = 2, + DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, + + DENOISER_INPUT_NUM, +}; + typedef int DenoiserTypeMask; class DenoiseParams { @@ -72,10 +81,10 @@ class DenoiseParams { /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ bool clamp_input; - /** Optix Denoiser **/ + /** OIDN/Optix Denoiser **/ - /* Passes handed over to the OptiX denoiser (default to color + albedo). */ - int optix_input_passes; + /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */ + DenoiserInput input_passes; DenoiseParams() { @@ -91,7 +100,7 @@ class DenoiseParams { neighbor_frames = 2; clamp_input = true; - optix_input_passes = 2; + input_passes = DENOISER_INPUT_RGB_ALBEDO_NORMAL; start_sample = 0; } @@ -150,8 +159,8 @@ class DeviceTask { function<void(RenderTile &)> update_tile_sample; function<void(RenderTile &)> release_tile; function<bool()> get_cancel; - function<void(RenderTile *, Device *)> map_neighbor_tiles; - function<void(RenderTile *, Device *)> unmap_neighbor_tiles; + function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles; + function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles; uint tile_types; DenoiseParams denoising; diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index 8c94815b193..e851749949d 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -1850,7 +1850,7 @@ void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.render_buffer.samples = rtile.sample; denoising.buffer.gpu_temporary_mem = true; - denoising.run_denoising(&rtile); + denoising.run_denoising(rtile); } void OpenCLDevice::shader(DeviceTask &task) |