10 files changed, 592 insertions, 201 deletions
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index b9bbeb9a25b..3a2eb8df95b 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -383,11 +383,24 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu
       }
     }
 
-    const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
-    VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
-    if (path_exists(ptx)) {
-      VLOG(1) << "Using precompiled kernel.";
-      return ptx;
+    /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
+    int ptx_major = major, ptx_minor = minor;
+    while (ptx_major >= 3) {
+      const string ptx = path_get(
+          string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+      VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+      if (path_exists(ptx)) {
+        VLOG(1) << "Using precompiled kernel.";
+        return ptx;
+      }
+
+      if (ptx_minor > 0) {
+        ptx_minor--;
+      }
+      else {
+        ptx_major--;
+        ptx_minor = 9;
+      }
     }
   }
 
@@ -1760,7 +1773,7 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
   denoising.render_buffer.samples = rtile.sample;
   denoising.buffer.gpu_temporary_mem = true;
 
-  denoising.run_denoising(&rtile);
+  denoising.run_denoising(rtile);
 }
 
 void CUDADevice::adaptive_sampling_filter(uint filter_sample,
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 9dbb33980b4..407f73e8451 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -209,13 +209,13 @@ bool Device::bind_fallback_display_space_shader(const float width, const float h
     glUseProgram(fallback_shader_program);
     image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
     if (image_texture_location < 0) {
-      LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform.";
+      LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
       return false;
     }
 
     fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
     if (fullscreen_location < 0) {
-      LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform.";
+      LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
       return false;
     }
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index a5833369a17..115b05e3911 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -439,10 +439,10 @@ class Device {
   {
     return 0;
   }
-  virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/)
+  virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
   {
   }
-  virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/)
+  virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
   {
   }
 
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 8f68e66a1b4..ee3a3ddea64 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -182,6 +182,7 @@ class CPUDevice : public Device {
   oidn::DeviceRef oidn_device;
   oidn::FilterRef oidn_filter;
 #endif
+  thread_spin_lock oidn_task_lock;
 
   bool use_split_kernel;
 
@@ -948,12 +949,25 @@ class CPUDevice : public Device {
     }
   }
 
-  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  void denoise_openimagedenoise_buffer(DeviceTask &task,
+                                       float *buffer,
+                                       const size_t offset,
+                                       const size_t stride,
+                                       const size_t x,
+                                       const size_t y,
+                                       const size_t w,
+                                       const size_t h,
+                                       const float scale)
   {
 #ifdef WITH_OPENIMAGEDENOISE
     assert(openimagedenoise_supported());
 
-    /* Only one at a time, since OpenImageDenoise itself is multithreaded. */
+    /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
+     * buffers, and for tiled rendering because creating multiple devices and filters
+     * is slow and memory hungry as well.
+     *
+     * TODO: optimize tiled rendering case, by batching together denoising of many
+     * tiles somehow? */
     static thread_mutex mutex;
     thread_scoped_lock lock(mutex);
 
@@ -964,54 +978,192 @@ class CPUDevice : public Device {
     }
     if (!oidn_filter) {
       oidn_filter = oidn_device.newFilter("RT");
+      oidn_filter.set("hdr", true);
+      oidn_filter.set("srgb", false);
     }
 
-    /* Copy pixels from compute device to CPU (no-op for CPU device). */
-    rtile.buffers->buffer.copy_from_device();
-
     /* Set images with appropriate stride for our interleaved pass storage. */
-    const struct {
+    struct {
       const char *name;
-      int offset;
-    } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR},
-                  {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL},
-                  {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO},
-                  {"output", 0},
+      const int offset;
+      const bool scale;
+      const bool use;
+      array<float> scaled_buffer;
+    } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
+                  {"albedo",
+                   task.pass_denoising_data + DENOISING_PASS_ALBEDO,
+                   true,
+                   task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
+                  {"normal",
+                   task.pass_denoising_data + DENOISING_PASS_NORMAL,
+                   true,
+                   task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
+                  {"output", 0, false, true},
                   { NULL,
                     0 }};
 
     for (int i = 0; passes[i].name; i++) {
-      const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride;
-      const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float);
-      const int64_t pixel_stride = task.pass_stride * sizeof(float);
-      const int64_t row_stride = rtile.stride * pixel_stride;
+      if (!passes[i].use) {
+        continue;
+      }
 
-      oidn_filter.setImage(passes[i].name,
-                           (char *)rtile.buffer + buffer_offset,
-                           oidn::Format::Float3,
-                           rtile.w,
-                           rtile.h,
-                           0,
-                           pixel_stride,
-                           row_stride);
+      const int64_t pixel_offset = offset + x + y * stride;
+      const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
+      const int64_t pixel_stride = task.pass_stride;
+      const int64_t row_stride = stride * pixel_stride;
+
+      if (passes[i].scale && scale != 1.0f) {
+        /* Normalize albedo and normal passes as they are scaled by the number of samples.
+         * For the color passes OIDN will perform auto-exposure making it unnecessary. */
+        array<float> &scaled_buffer = passes[i].scaled_buffer;
+        scaled_buffer.resize(w * h * 3);
+
+        for (size_t row = 0; row < h; row++) {
+          const float *pass_row = buffer + buffer_offset + row * row_stride;
+          float *scaled_row = scaled_buffer.data() + row * w * 3;
+
+          for (size_t col = 0; col < w; col++) {
+            scaled_row[col * 3 + 0] = pass_row[col * pixel_stride + 0] * scale;
+            scaled_row[col * 3 + 1] = pass_row[col * pixel_stride + 1] * scale;
+            scaled_row[col * 3 + 2] = pass_row[col * pixel_stride + 2] * scale;
+          }
+        }
+
+        oidn_filter.setImage(
+            passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
+      }
+      else {
+        oidn_filter.setImage(passes[i].name,
+                             buffer + buffer_offset,
+                             oidn::Format::Float3,
+                             w,
+                             h,
+                             0,
+                             pixel_stride * sizeof(float),
+                             row_stride * sizeof(float));
+      }
     }
 
     /* Execute filter. */
-    oidn_filter.set("hdr", true);
-    oidn_filter.set("srgb", false);
     oidn_filter.commit();
     oidn_filter.execute();
-
-    /* todo: it may be possible to avoid this copy, but we have to ensure that
-     * when other code copies data from the device it doesn't overwrite the
-     * denoiser buffers. */
-    rtile.buffers->buffer.copy_to_device();
 #else
     (void)task;
-    (void)rtile;
+    (void)buffer;
+    (void)offset;
+    (void)stride;
+    (void)x;
+    (void)y;
+    (void)w;
+    (void)h;
+    (void)scale;
 #endif
   }
 
+  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  {
+    if (task.type == DeviceTask::DENOISE_BUFFER) {
+      /* Copy pixels from compute device to CPU (no-op for CPU device). */
+      rtile.buffers->buffer.copy_from_device();
+
+      denoise_openimagedenoise_buffer(task,
+                                      (float *)rtile.buffer,
+                                      rtile.offset,
+                                      rtile.stride,
+                                      rtile.x,
+                                      rtile.y,
+                                      rtile.w,
+                                      rtile.h,
+                                      1.0f / rtile.sample);
+
+      /* TODO: it may be possible to avoid this copy, but we have to ensure that
+       * when other code copies data from the device it doesn't overwrite the
+       * denoiser buffers. */
+      rtile.buffers->buffer.copy_to_device();
+    }
+    else {
+      /* Per-tile denoising: all passes are normalized by 1 / sample while merging. */
+      rtile.sample = rtile.start_sample + rtile.num_samples;
+      const float scale = 1.0f / rtile.sample;
+      const float invscale = rtile.sample;
+      const size_t pass_stride = task.pass_stride;
+
+      /* Map the neighboring tiles; the center tile is read back as it may have changed. */
+      RenderTileNeighbors neighbors(rtile);
+      task.map_neighbor_tiles(neighbors, this);
+      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+      rtile = center_tile;
+
+      /* Calculate size of the tile to denoise (including overlap). The overlap
+       * size was chosen empirically. OpenImageDenoise specifies an overlap size
+       * of 128 but this is significantly bigger than typical tile size. */
+      const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
+      const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
+
+      /* Adjacent tiles are in separate memory regions, copy (scaled) into a single buffer. */
+      array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
+
+      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+        RenderTile &ntile = neighbors.tiles[i];
+        if (!ntile.buffer) {
+          continue;
+        }
+
+        const int xmin = max(ntile.x, rect.x);
+        const int ymin = max(ntile.y, rect.y);
+        const int xmax = min(ntile.x + ntile.w, rect.z);
+        const int ymax = min(ntile.y + ntile.h, rect.w);
+
+        const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+        const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
+
+        const size_t merged_stride = rect_size.x;
+        const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+        float *merged_buffer = merged.data() + merged_offset * pass_stride;
+
+        for (int y = ymin; y < ymax; y++) {
+          for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
+            merged_buffer[x] = tile_buffer[x] * scale;
+          }
+          tile_buffer += ntile.stride * pass_stride;
+          merged_buffer += merged_stride * pass_stride;
+        }
+      }
+
+      /* Denoise the merged buffer; it is already normalized, so a scale of 1 is passed. */
+      denoise_openimagedenoise_buffer(
+          task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
+
+      /* Copy the first three floats of each pixel back to the target tile, undoing the scale. */
+      RenderTile &ntile = neighbors.target;
+      if (ntile.buffer) {
+        const int xmin = max(ntile.x, rect.x);
+        const int ymin = max(ntile.y, rect.y);
+        const int xmax = min(ntile.x + ntile.w, rect.z);
+        const int ymax = min(ntile.y + ntile.h, rect.w);
+
+        const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+        float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
+
+        const size_t merged_stride = rect_size.x;
+        const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+        const float *merged_buffer = merged.data() + merged_offset * pass_stride;
+
+        for (int y = ymin; y < ymax; y++) {
+          for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
+            tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
+            tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
+            tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
+          }
+          tile_buffer += ntile.stride * pass_stride;
+          merged_buffer += merged_stride * pass_stride;
+        }
+      }
+
+      task.unmap_neighbor_tiles(neighbors, this);
+    }
+  }
+
   void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
   {
     ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
@@ -1040,7 +1192,7 @@ class CPUDevice : public Device {
     denoising.render_buffer.samples = tile.sample;
     denoising.buffer.gpu_temporary_mem = false;
 
-    denoising.run_denoising(&tile);
+    denoising.run_denoising(tile);
   }
 
   void thread_render(DeviceTask &task)
@@ -1070,10 +1222,23 @@ class CPUDevice : public Device {
       }
     }
 
+    /* NLM denoiser. */
     DenoisingTask *denoising = NULL;
 
+    /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+     * avoid waiting with mutex locks in the denoiser, only the thread that wins
+     * oidn_task_lock acquires denoising tiles, and only it releases the lock. */
+    uint tile_types = task.tile_types;
+    bool hold_denoise_lock = false;
+    if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+      hold_denoise_lock = oidn_task_lock.try_lock();
+      if (!hold_denoise_lock) {
+        tile_types &= ~RenderTile::DENOISE;
+      }
+    }
+
     RenderTile tile;
-    while (task.acquire_tile(this, tile, task.tile_types)) {
+    while (task.acquire_tile(this, tile, tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         if (use_split_kernel) {
           device_only_memory<uchar> void_buffer(this, "void_buffer");
@@ -1108,6 +1273,10 @@ class CPUDevice : public Device {
       }
     }
 
+    if (hold_denoise_lock) {
+      oidn_task_lock.unlock();
+    }
+
     profiler.remove_state(&kg->profiler);
 
     thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 89de80a5bcd..38c42d15cab 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -71,29 +71,30 @@ DenoisingTask::~DenoisingTask()
   tile_info_mem.free();
 }
 
-void DenoisingTask::set_render_buffer(RenderTile *rtiles)
+void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
 {
-  for (int i = 0; i < 9; i++) {
-    tile_info->offsets[i] = rtiles[i].offset;
-    tile_info->strides[i] = rtiles[i].stride;
-    tile_info->buffers[i] = rtiles[i].buffer;
+  for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+    RenderTile &rtile = neighbors.tiles[i];
+    tile_info->offsets[i] = rtile.offset;
+    tile_info->strides[i] = rtile.stride;
+    tile_info->buffers[i] = rtile.buffer;
   }
-  tile_info->x[0] = rtiles[3].x;
-  tile_info->x[1] = rtiles[4].x;
-  tile_info->x[2] = rtiles[5].x;
-  tile_info->x[3] = rtiles[5].x + rtiles[5].w;
-  tile_info->y[0] = rtiles[1].y;
-  tile_info->y[1] = rtiles[4].y;
-  tile_info->y[2] = rtiles[7].y;
-  tile_info->y[3] = rtiles[7].y + rtiles[7].h;
-
-  target_buffer.offset = rtiles[9].offset;
-  target_buffer.stride = rtiles[9].stride;
-  target_buffer.ptr = rtiles[9].buffer;
-
-  if (do_prefilter && rtiles[9].buffers) {
+  tile_info->x[0] = neighbors.tiles[3].x;
+  tile_info->x[1] = neighbors.tiles[4].x;
+  tile_info->x[2] = neighbors.tiles[5].x;
+  tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
+  tile_info->y[0] = neighbors.tiles[1].y;
+  tile_info->y[1] = neighbors.tiles[4].y;
+  tile_info->y[2] = neighbors.tiles[7].y;
+  tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
+
+  target_buffer.offset = neighbors.target.offset;
+  target_buffer.stride = neighbors.target.stride;
+  target_buffer.ptr = neighbors.target.buffer;
+
+  if (do_prefilter && neighbors.target.buffers) {
     target_buffer.denoising_output_offset =
-        rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+        neighbors.target.buffers->params.get_denoising_prefiltered_offset();
   }
   else {
     target_buffer.denoising_output_offset = 0;
@@ -320,12 +321,11 @@ void DenoisingTask::reconstruct()
   functions.solve(target_buffer.ptr);
 }
 
-void DenoisingTask::run_denoising(RenderTile *tile)
+void DenoisingTask::run_denoising(RenderTile &tile)
 {
-  RenderTile rtiles[10];
-  rtiles[4] = *tile;
-  functions.map_neighbor_tiles(rtiles);
-  set_render_buffer(rtiles);
+  RenderTileNeighbors neighbors(tile);
+  functions.map_neighbor_tiles(neighbors);
+  set_render_buffer(neighbors);
 
   setup_denoising_buffer();
 
@@ -347,7 +347,7 @@ void DenoisingTask::run_denoising(RenderTile *tile)
     write_buffer();
   }
 
-  functions.unmap_neighbor_tiles(rtiles);
+  functions.unmap_neighbor_tiles(neighbors);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 4c122e981eb..2c0dc23b44a 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -102,8 +102,8 @@ class DenoisingTask {
                   device_ptr output_ptr)>
         detect_outliers;
     function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature;
-    function<void(RenderTile *rtiles)> map_neighbor_tiles;
-    function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
+    function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
+    function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
   } functions;
 
   /* Stores state of the current Reconstruction operation,
@@ -154,7 +154,7 @@ class DenoisingTask {
   DenoisingTask(Device *device, const DeviceTask &task);
   ~DenoisingTask();
 
-  void run_denoising(RenderTile *tile);
+  void run_denoising(RenderTile &tile);
 
   struct DenoiseBuffers {
     int pass_stride;
@@ -179,7 +179,7 @@ class DenoisingTask {
  protected:
   Device *device;
 
-  void set_render_buffer(RenderTile *rtiles);
+  void set_render_buffer(RenderTileNeighbors &neighbors);
   void setup_denoising_buffer();
   void prefilter_shadowing();
   void prefilter_features();
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index fd14bbdccc5..9ea8782d0f0 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -177,8 +177,11 @@ class MultiDevice : public Device {
         return false;
 
     if (requested_features.use_denoising) {
+      /* Only need denoising feature, everything else is unused. */
+      DeviceRequestedFeatures denoising_features;
+      denoising_features.use_denoising = true;
       foreach (SubDevice &sub, denoising_devices)
-        if (!sub.device->load_kernels(requested_features))
+        if (!sub.device->load_kernels(denoising_features))
           return false;
     }
 
@@ -581,20 +584,22 @@ class MultiDevice : public Device {
     return -1;
   }
 
-  void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+  void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors)
   {
-    for (int i = 0; i < 9; i++) {
-      if (!tiles[i].buffers) {
+    for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+      RenderTile &tile = neighbors.tiles[i];
+
+      if (!tile.buffers) {
         continue;
       }
 
-      device_vector<float> &mem = tiles[i].buffers->buffer;
-      tiles[i].buffer = mem.device_pointer;
+      device_vector<float> &mem = tile.buffers->buffer;
+      tile.buffer = mem.device_pointer;
 
       if (mem.device == this && matching_rendering_and_denoising_devices) {
         /* Skip unnecessary copies in viewport mode (buffer covers the
          * whole image), but still need to fix up the tile device pointer. */
-        map_tile(sub_device, tiles[i]);
+        map_tile(sub_device, tile);
         continue;
       }
 
@@ -607,15 +612,15 @@ class MultiDevice : public Device {
          * also required for the case where a CPU thread is denoising
          * a tile rendered on the GPU. In that case we have to avoid
          * overwriting the buffer being de-noised by the CPU thread. */
-        if (!tiles[i].buffers->map_neighbor_copied) {
-          tiles[i].buffers->map_neighbor_copied = true;
+        if (!tile.buffers->map_neighbor_copied) {
+          tile.buffers->map_neighbor_copied = true;
           mem.copy_from_device();
         }
 
         if (mem.device == this) {
           /* Can re-use memory if tile is already allocated on the sub device. */
-          map_tile(sub_device, tiles[i]);
-          mem.swap_device(sub_device, mem.device_size, tiles[i].buffer);
+          map_tile(sub_device, tile);
+          mem.swap_device(sub_device, mem.device_size, tile.buffer);
         }
         else {
           mem.swap_device(sub_device, 0, 0);
@@ -623,40 +628,42 @@ class MultiDevice : public Device {
 
         mem.copy_to_device();
 
-        tiles[i].buffer = mem.device_pointer;
-        tiles[i].device_size = mem.device_size;
+        tile.buffer = mem.device_pointer;
+        tile.device_size = mem.device_size;
 
         mem.restore_device();
       }
     }
   }
 
-  void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+  void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors)
   {
-    device_vector<float> &mem = tiles[9].buffers->buffer;
+    RenderTile &target_tile = neighbors.target;
+    device_vector<float> &mem = target_tile.buffers->buffer;
 
     if (mem.device == this && matching_rendering_and_denoising_devices) {
       return;
     }
 
     /* Copy denoised result back to the host. */
-    mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer);
+    mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
     mem.copy_from_device();
     mem.restore_device();
 
     /* Copy denoised result to the original device. */
     mem.copy_to_device();
 
-    for (int i = 0; i < 9; i++) {
-      if (!tiles[i].buffers) {
+    for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+      RenderTile &tile = neighbors.tiles[i];
+      if (!tile.buffers) {
         continue;
       }
 
-      device_vector<float> &mem = tiles[i].buffers->buffer;
+      device_vector<float> &mem = tile.buffers->buffer;
 
       if (mem.device != sub_device && mem.device != this) {
         /* Free up memory again if it was allocated for the copy above. */
-        mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer);
+        mem.swap_device(sub_device, tile.device_size, tile.buffer);
         sub_device->mem_free(mem);
         mem.restore_device();
       }
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ececca3df53..1cc45983565 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -131,8 +131,12 @@ class OptiXDevice : public CUDADevice {
     PG_RGEN,
     PG_MISS,
     PG_HITD,  // Default hit group
-    PG_HITL,  // __BVH_LOCAL__ hit group
     PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
+    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
+#  if OPTIX_ABI_VERSION >= 36
+    PG_HITD_MOTION,
+    PG_HITS_MOTION,
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
     PG_EXCP,
 #  endif
@@ -177,6 +181,7 @@ class OptiXDevice : public CUDADevice {
   OptixDeviceContext context = NULL;
 
   OptixModule optix_module = NULL;  // All necessary OptiX kernels are in one module
+  OptixModule builtin_modules[2] = {};
   OptixPipeline pipelines[NUM_PIPELINES] = {};
 
   bool motion_blur = false;
@@ -264,6 +269,9 @@ class OptiXDevice : public CUDADevice {
     // Unload modules
     if (optix_module != NULL)
       optixModuleDestroy(optix_module);
+    for (unsigned int i = 0; i < 2; ++i)
+      if (builtin_modules[i] != NULL)
+        optixModuleDestroy(builtin_modules[i]);
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
       if (pipelines[i] != NULL)
         optixPipelineDestroy(pipelines[i]);
@@ -338,6 +346,12 @@ class OptiXDevice : public CUDADevice {
       optixModuleDestroy(optix_module);
       optix_module = NULL;
     }
+    for (unsigned int i = 0; i < 2; ++i) {
+      if (builtin_modules[i] != NULL) {
+        optixModuleDestroy(builtin_modules[i]);
+        builtin_modules[i] = NULL;
+      }
+    }
     for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
       if (pipelines[i] != NULL) {
         optixPipelineDestroy(pipelines[i]);
@@ -369,6 +383,18 @@ class OptiXDevice : public CUDADevice {
 #  endif
     pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h
 
+#  if OPTIX_ABI_VERSION >= 36
+    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+    if (requested_features.use_hair) {
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+      }
+      else {
+        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+      }
+    }
+#  endif
+
     // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
     // This is necessary since objects may be reported to have motion if the Vector pass is
     // active, but may still need to be rendered without motion blur if that isn't active as well
@@ -442,6 +468,34 @@ class OptiXDevice : public CUDADevice {
         group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
         group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
       }
+
+#  if OPTIX_ABI_VERSION >= 36
+      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
+        OptixBuiltinISOptions builtin_options;
+        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+        builtin_options.usesMotionBlur = false;
+
+        check_result_optix_ret(optixBuiltinISModuleGet(
+            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+        if (motion_blur) {
+          builtin_options.usesMotionBlur = true;
+
+          check_result_optix_ret(optixBuiltinISModuleGet(
+              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+        }
+      }
+#  endif
     }
 
     if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
@@ -493,8 +547,14 @@ class OptiXDevice : public CUDADevice {
     unsigned int trace_css = stack_size[PG_HITD].cssCH;
     // This is based on the maximum of closest-hit and any-hit/intersection programs
     trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
-    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
     trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+#  if OPTIX_ABI_VERSION >= 36
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+    trace_css = std::max(trace_css,
+                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+#  endif
 
     OptixPipelineLinkOptions link_options;
     link_options.maxTraceDepth = 1;
@@ -503,17 +563,23 @@ class OptiXDevice : public CUDADevice {
 #  else
     link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
 #  endif
-    link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;
+#  if OPTIX_ABI_VERSION < 24
+    link_options.overrideUsesMotionBlur = motion_blur;
+#  endif
 
     {  // Create path tracing pipeline
       OptixProgramGroup pipeline_groups[] = {
-          groups[PG_RGEN],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_RGEN],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
       };
       check_result_optix_ret(
@@ -530,8 +596,8 @@ class OptiXDevice : public CUDADevice {
       const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
 
       // Set stack size depending on pipeline options
-      check_result_optix_ret(optixPipelineSetStackSize(
-          pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
+      check_result_optix_ret(
+          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2)));
     }
 
     // Only need to create shader evaluation pipeline if one of these features is used:
@@ -541,15 +607,19 @@ class OptiXDevice : public CUDADevice {
 
     if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
       OptixProgramGroup pipeline_groups[] = {
-          groups[PG_BAKE],
-          groups[PG_DISP],
-          groups[PG_BACK],
-          groups[PG_MISS],
-          groups[PG_HITD],
-          groups[PG_HITS],
-          groups[PG_HITL],
+        groups[PG_BAKE],
+        groups[PG_DISP],
+        groups[PG_BACK],
+        groups[PG_MISS],
+        groups[PG_HITD],
+        groups[PG_HITS],
+        groups[PG_HITL],
+#  if OPTIX_ABI_VERSION >= 36
+        groups[PG_HITD_MOTION],
+        groups[PG_HITS_MOTION],
+#  endif
 #  ifdef WITH_CYCLES_DEBUG
-          groups[PG_EXCP],
+        groups[PG_EXCP],
 #  endif
       };
       check_result_optix_ret(
@@ -672,7 +742,11 @@ class OptiXDevice : public CUDADevice {
       sbt_params.missRecordCount = 1;
       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif
 
       // Launch the ray generation program
       check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
@@ -727,19 +801,18 @@ class OptiXDevice : public CUDADevice {
       //   0 1 2
       //   3 4 5
       //   6 7 8  9
-      RenderTile rtiles[10];
-      rtiles[4] = rtile;
-      task.map_neighbor_tiles(rtiles, this);
-      rtile = rtiles[4];  // Tile may have been modified by mapping code
+      RenderTileNeighbors neighbors(rtile);
+      task.map_neighbor_tiles(neighbors, this);
+      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+      RenderTile &target_tile = neighbors.target;
+      rtile = center_tile;  // Tile may have been modified by mapping code
 
       // Calculate size of the tile to denoise (including overlap)
-      int4 rect = make_int4(
-          rtiles[4].x, rtiles[4].y, rtiles[4].x + rtiles[4].w, rtiles[4].y + rtiles[4].h);
+      int4 rect = center_tile.bounds();
       // Overlap between tiles has to be at least 64 pixels
       // TODO(pmours): Query this value from OptiX
       rect = rect_expand(rect, 64);
-      int4 clip_rect = make_int4(
-          rtiles[3].x, rtiles[1].y, rtiles[5].x + rtiles[5].w, rtiles[7].y + rtiles[7].h);
+      int4 clip_rect = neighbors.bounds();
       rect = rect_clip(rect, clip_rect);
       int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
       int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
@@ -760,14 +833,14 @@ class OptiXDevice : public CUDADevice {
       device_only_memory<float> input(this, "denoiser input");
       device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);
 
-      if ((!rtiles[0].buffer || rtiles[0].buffer == rtile.buffer) &&
-          (!rtiles[1].buffer || rtiles[1].buffer == rtile.buffer) &&
-          (!rtiles[2].buffer || rtiles[2].buffer == rtile.buffer) &&
-          (!rtiles[3].buffer || rtiles[3].buffer == rtile.buffer) &&
-          (!rtiles[5].buffer || rtiles[5].buffer == rtile.buffer) &&
-          (!rtiles[6].buffer || rtiles[6].buffer == rtile.buffer) &&
-          (!rtiles[7].buffer || rtiles[7].buffer == rtile.buffer) &&
-          (!rtiles[8].buffer || rtiles[8].buffer == rtile.buffer)) {
+      bool contiguous_memory = true;
+      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+        if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
+          contiguous_memory = false;
+        }
+      }
+
+      if (contiguous_memory) {
         // Tiles are in continous memory, so can just subtract overlap offset
         input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
         // Stride covers the whole width of the image and not just a single tile
@@ -782,19 +855,19 @@ class OptiXDevice : public CUDADevice {
         input_stride *= rect_size.x;
 
         TileInfo *tile_info = tile_info_mem.alloc(1);
-        for (int i = 0; i < 9; i++) {
-          tile_info->offsets[i] = rtiles[i].offset;
-          tile_info->strides[i] = rtiles[i].stride;
-          tile_info->buffers[i] = rtiles[i].buffer;
+        for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+          tile_info->offsets[i] = neighbors.tiles[i].offset;
+          tile_info->strides[i] = neighbors.tiles[i].stride;
+          tile_info->buffers[i] = neighbors.tiles[i].buffer;
         }
-        tile_info->x[0] = rtiles[3].x;
-        tile_info->x[1] = rtiles[4].x;
-        tile_info->x[2] = rtiles[5].x;
-        tile_info->x[3] = rtiles[5].x + rtiles[5].w;
-        tile_info->y[0] = rtiles[1].y;
-        tile_info->y[1] = rtiles[4].y;
-        tile_info->y[2] = rtiles[7].y;
-        tile_info->y[3] = rtiles[7].y + rtiles[7].h;
+        tile_info->x[0] = neighbors.tiles[3].x;
+        tile_info->x[1] = neighbors.tiles[4].x;
+        tile_info->x[2] = neighbors.tiles[5].x;
+        tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
+        tile_info->y[0] = neighbors.tiles[1].y;
+        tile_info->y[1] = neighbors.tiles[4].y;
+        tile_info->y[2] = neighbors.tiles[7].y;
+        tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
         tile_info_mem.copy_to_device();
 
         void *args[] = {
@@ -804,7 +877,7 @@ class OptiXDevice : public CUDADevice {
 
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
       device_only_memory<float> input_rgb(this, "denoiser input rgb");
-      input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.optix_input_passes);
+      input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
 
       void *input_args[] = {&input_rgb.device_pointer,
                             &input_ptr,
@@ -813,7 +886,7 @@ class OptiXDevice : public CUDADevice {
                             &input_stride,
                             &task.pass_stride,
                             const_cast<int *>(pass_offset),
-                            &task.denoising.optix_input_passes,
+                            &task.denoising.input_passes,
                             &rtile.sample};
       launch_filter_kernel(
           "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
@@ -824,7 +897,7 @@ class OptiXDevice : public CUDADevice {
 #  endif
 
       const bool recreate_denoiser = (denoiser == NULL) ||
-                                     (task.denoising.optix_input_passes != denoiser_input_passes);
+                                     (task.denoising.input_passes != denoiser_input_passes);
       if (recreate_denoiser) {
         // Destroy existing handle before creating new one
         if (denoiser != NULL) {
@@ -833,23 +906,29 @@ class OptiXDevice : public CUDADevice {
 
         // Create OptiX denoiser handle on demand when it is first used
         OptixDenoiserOptions denoiser_options;
-        assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
+        assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
         denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
-            OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1));
+            OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
+#  if OPTIX_ABI_VERSION < 28
         denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
+#  endif
         check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
         check_result_optix_ret(
             optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
 
         // OptiX denoiser handle was created with the requested number of input passes
-        denoiser_input_passes = task.denoising.optix_input_passes;
+        denoiser_input_passes = task.denoising.input_passes;
       }
 
       OptixDenoiserSizes sizes = {};
       check_result_optix_ret(
           optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
 
+#  if OPTIX_ABI_VERSION < 28
       const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
+#  else
+      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
+#  endif
       const size_t scratch_offset = sizes.stateSizeInBytes;
 
       // Allocate denoiser state if tile size has changed since last setup
@@ -897,10 +976,10 @@ class OptiXDevice : public CUDADevice {
       int2 output_offset = overlap_offset;
       overlap_offset = make_int2(0, 0);  // Not supported by denoiser API, so apply manually
 #  else
-      output_layers[0].data = rtiles[9].buffer + pixel_offset;
-      output_layers[0].width = rtiles[9].w;
-      output_layers[0].height = rtiles[9].h;
-      output_layers[0].rowStrideInBytes = rtiles[9].stride * pixel_stride;
+      output_layers[0].data = target_tile.buffer + pixel_offset;
+      output_layers[0].width = target_tile.w;
+      output_layers[0].height = target_tile.h;
+      output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
       output_layers[0].pixelStrideInBytes = pixel_stride;
 #  endif
       output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
@@ -913,7 +992,7 @@ class OptiXDevice : public CUDADevice {
                                                  denoiser_state.device_pointer,
                                                  scratch_offset,
                                                  input_layers,
-                                                 task.denoising.optix_input_passes,
+                                                 task.denoising.input_passes,
                                                  overlap_offset.x,
                                                  overlap_offset.y,
                                                  output_layers,
@@ -922,26 +1001,26 @@ class OptiXDevice : public CUDADevice {
 
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
       void *output_args[] = {&input_ptr,
-                             &rtiles[9].buffer,
+                             &target_tile.buffer,
                              &output_offset.x,
                              &output_offset.y,
                              &rect_size.x,
                              &rect_size.y,
-                             &rtiles[9].x,
-                             &rtiles[9].y,
-                             &rtiles[9].w,
-                             &rtiles[9].h,
-                             &rtiles[9].offset,
-                             &rtiles[9].stride,
+                             &target_tile.x,
+                             &target_tile.y,
+                             &target_tile.w,
+                             &target_tile.h,
+                             &target_tile.offset,
+                             &target_tile.stride,
                              &task.pass_stride,
                              &rtile.sample};
       launch_filter_kernel(
-          "kernel_cuda_filter_convert_from_rgb", rtiles[9].w, rtiles[9].h, output_args);
+          "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
 #  endif
 
       check_result_cuda_ret(cuStreamSynchronize(0));
 
-      task.unmap_neighbor_tiles(rtiles, this);
+      task.unmap_neighbor_tiles(neighbors, this);
     }
     else {
       // Run CUDA denoising kernels
@@ -993,7 +1072,11 @@ class OptiXDevice : public CUDADevice {
       sbt_params.missRecordCount = 1;
       sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
       sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS
+#  if OPTIX_ABI_VERSION >= 36
+      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
+#  else
+      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
+#  endif
 
       check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                      cuda_stream[thread_index],
@@ -1070,7 +1153,7 @@ class OptiXDevice : public CUDADevice {
                                            &build_input,
                                            1,
                                            temp_mem.device_pointer,
-                                           temp_mem.device_size,
+                                           sizes.tempSizeInBytes,
                                            out_data,
                                            sizes.outputSizeInBytes,
                                            &out_handle,
@@ -1142,7 +1225,6 @@ class OptiXDevice : public CUDADevice {
           continue;
         }
 
-        const size_t num_curves = hair->num_curves();
         const size_t num_segments = hair->num_segments();
 
         size_t num_motion_steps = 1;
@@ -1152,7 +1234,18 @@ class OptiXDevice : public CUDADevice {
         }
 
         device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
-        aabb_data.alloc(num_segments * num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+        device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+        // Four control points for each curve segment
+        const size_t num_vertices = num_segments * 4;
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          index_data.alloc(num_segments);
+          vertex_data.alloc(num_vertices * num_motion_steps);
+        }
+        else
+#  endif
+          aabb_data.alloc(num_segments * num_motion_steps);
 
         // Get AABBs for each motion step
         for (size_t step = 0; step < num_motion_steps; ++step) {
@@ -1165,44 +1258,127 @@ class OptiXDevice : public CUDADevice {
             keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size();
           }
 
-          size_t i = step * num_segments;
-          for (size_t j = 0; j < num_curves; ++j) {
-            const Hair::Curve c = hair->get_curve(j);
-
-            for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
-              BoundBox bounds = BoundBox::empty;
-              c.bounds_grow(k, keys, hair->curve_radius.data(), bounds);
-
-              aabb_data[i].minX = bounds.min.x;
-              aabb_data[i].minY = bounds.min.y;
-              aabb_data[i].minZ = bounds.min.z;
-              aabb_data[i].maxX = bounds.max.x;
-              aabb_data[i].maxY = bounds.max.y;
-              aabb_data[i].maxZ = bounds.max.z;
+          for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+            const Hair::Curve curve = hair->get_curve(j);
+
+            for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+#  if OPTIX_ABI_VERSION >= 36
+              if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+                int k0 = curve.first_key + segment;
+                int k1 = k0 + 1;
+                int ka = max(k0 - 1, curve.first_key);
+                int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+                const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+                const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+                const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+                const float4 pw = make_float4(hair->curve_radius[ka],
+                                              hair->curve_radius[k0],
+                                              hair->curve_radius[k1],
+                                              hair->curve_radius[kb]);
+
+                // Convert Catmull-Rom data to cubic B-spline control points
+                static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+                static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+                static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+                static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+                index_data[i] = i * 4;
+                float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+                v[0] = make_float4(
+                    dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+                v[1] = make_float4(
+                    dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+                v[2] = make_float4(
+                    dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+                v[3] = make_float4(
+                    dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+              }
+              else
+#  endif
+              {
+                BoundBox bounds = BoundBox::empty;
+                curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds);
+
+                const size_t index = step * num_segments + i;
+                aabb_data[index].minX = bounds.min.x;
+                aabb_data[index].minY = bounds.min.y;
+                aabb_data[index].minZ = bounds.min.z;
+                aabb_data[index].maxX = bounds.max.x;
+                aabb_data[index].maxY = bounds.max.y;
+                aabb_data[index].maxZ = bounds.max.z;
+              }
             }
           }
         }
 
         // Upload AABB data to GPU
         aabb_data.copy_to_device();
+#  if OPTIX_ABI_VERSION >= 36
+        index_data.copy_to_device();
+        vertex_data.copy_to_device();
+#  endif
 
         vector<device_ptr> aabb_ptrs;
         aabb_ptrs.reserve(num_motion_steps);
+#  if OPTIX_ABI_VERSION >= 36
+        vector<device_ptr> width_ptrs;
+        vector<device_ptr> vertex_ptrs;
+        width_ptrs.reserve(num_motion_steps);
+        vertex_ptrs.reserve(num_motion_steps);
+#  endif
         for (size_t step = 0; step < num_motion_steps; ++step) {
           aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+#  if OPTIX_ABI_VERSION >= 36
+          const device_ptr base_ptr = vertex_data.device_pointer +
+                                      step * num_vertices * sizeof(float4);
+          width_ptrs.push_back(base_ptr + 3 * sizeof(float));  // Offset by vertex size
+          vertex_ptrs.push_back(base_ptr);
+#  endif
         }
 
-        // Disable visibility test anyhit program, since it is already checked during intersection
-        // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
         OptixBuildInput build_input = {};
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-        build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
-        build_input.aabbArray.numPrimitives = num_segments;
-        build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
-        build_input.aabbArray.flags = &build_flags;
-        build_input.aabbArray.numSbtRecords = 1;
-        build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  if OPTIX_ABI_VERSION >= 36
+        if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+          build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+          build_input.curveArray.numPrimitives = num_segments;
+          build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+          build_input.curveArray.numVertices = num_vertices;
+          build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+          build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+          build_input.curveArray.widthStrideInBytes = sizeof(float4);
+          build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+          build_input.curveArray.indexStrideInBytes = sizeof(int);
+          build_input.curveArray.flag = build_flags;
+          build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+        }
+        else
+#  endif
+        {
+          // Disable visibility test any-hit program, since it is already checked during
+          // intersection. Those trace calls that require anyhit can force it with a ray flag.
+          build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+          build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+#  if OPTIX_ABI_VERSION < 23
+          build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.aabbArray.numPrimitives = num_segments;
+          build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+          build_input.aabbArray.flags = &build_flags;
+          build_input.aabbArray.numSbtRecords = 1;
+          build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  else
+          build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+          build_input.customPrimitiveArray.numPrimitives = num_segments;
+          build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+          build_input.customPrimitiveArray.flags = &build_flags;
+          build_input.customPrimitiveArray.numSbtRecords = 1;
+          build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+#  endif
+        }
 
         // Allocate memory for new BLAS and build it
         OptixTraversableHandle handle;
@@ -1257,8 +1433,8 @@ class OptiXDevice : public CUDADevice {
           vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
         }
 
-        // No special build flags for triangle primitives
-        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
+        // Force a single any-hit call, so shadow record-all behavior works correctly
+        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
         OptixBuildInput build_input = {};
         build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
         build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
@@ -1324,9 +1500,26 @@ class OptiXDevice : public CUDADevice {
       // Set user instance ID to object index
       instance.instanceId = ob->get_device_index();
 
-      // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
-      // See 'scene_intersect_volume' in bvh.h
-      instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1);
+      // Have to have at least one bit in the mask, or else instance would always be culled
+      instance.visibilityMask = 1;
+
+      if (ob->geometry->has_volume) {
+        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+        instance.visibilityMask |= 2;
+      }
+
+      if (ob->geometry->type == Geometry::HAIR) {
+        // Same applies to curves (so they can be skipped in local trace calls)
+        instance.visibilityMask |= 4;
+
+#  if OPTIX_ABI_VERSION >= 36
+        if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api &&
+            static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) {
+          // Select between motion blur and non-motion blur built-in intersection module
+          instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+        }
+#  endif
+      }
 
       // Insert motion traversable if object has motion
       if (motion_blur && ob->use_motion()) {
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 600973b8100..fd380788282 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -29,6 +29,7 @@ CCL_NAMESPACE_BEGIN
 class Device;
 class RenderBuffers;
 class RenderTile;
+class RenderTileNeighbors;
 class Tile;
 
 enum DenoiserType {
@@ -41,6 +42,14 @@ enum DenoiserType {
   DENOISER_ALL = ~0,
 };
 
+enum DenoiserInput { /* Value equals the number of denoiser input passes. */
+  DENOISER_INPUT_RGB = 1,               /* Color only. */
+  DENOISER_INPUT_RGB_ALBEDO = 2,        /* Color + albedo. */
+  DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, /* Color + albedo + normal. */
+
+  DENOISER_INPUT_NUM, /* Sentinel: one past the highest input value. */
+};
+
 typedef int DenoiserTypeMask;
 
 class DenoiseParams {
@@ -72,10 +81,10 @@ class DenoiseParams {
   /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
   bool clamp_input;
 
-  /** Optix Denoiser **/
+  /** OIDN/OptiX Denoiser **/
 
-  /* Passes handed over to the OptiX denoiser (default to color + albedo). */
-  int optix_input_passes;
+  /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo + normal). */
+  DenoiserInput input_passes;
 
   DenoiseParams()
   {
@@ -91,7 +100,7 @@ class DenoiseParams {
     neighbor_frames = 2;
     clamp_input = true;
 
-    optix_input_passes = 2;
+    input_passes = DENOISER_INPUT_RGB_ALBEDO_NORMAL;
 
     start_sample = 0;
   }
@@ -150,8 +159,8 @@ class DeviceTask {
   function<void(RenderTile &)> update_tile_sample;
   function<void(RenderTile &)> release_tile;
   function<bool()> get_cancel;
-  function<void(RenderTile *, Device *)> map_neighbor_tiles;
-  function<void(RenderTile *, Device *)> unmap_neighbor_tiles;
+  function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
+  function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
 
   uint tile_types;
   DenoiseParams denoising;
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index 8c94815b193..e851749949d 100644
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -1850,7 +1850,7 @@ void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
   denoising.render_buffer.samples = rtile.sample;
   denoising.buffer.gpu_temporary_mem = true;
 
-  denoising.run_denoising(&rtile);
+  denoising.run_denoising(rtile);
 }
 
 void OpenCLDevice::shader(DeviceTask &task)