Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r--  intern/cycles/integrator/denoiser.cpp             |  7
-rw-r--r--  intern/cycles/integrator/pass_accessor.cpp        |  6
-rw-r--r--  intern/cycles/integrator/path_trace.cpp           | 77
-rw-r--r--  intern/cycles/integrator/path_trace.h             |  4
-rw-r--r--  intern/cycles/integrator/path_trace_tile.cpp      |  2
-rw-r--r--  intern/cycles/integrator/path_trace_tile.h        |  2
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  | 68
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h    |  3
8 files changed, 119 insertions, 50 deletions
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
index 94991d63e4c..831bd3a4407 100644
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -101,10 +101,17 @@ static Device *find_best_device(Device *device, DenoiserType type)
     if ((sub_device->info.denoisers & type) == 0) {
       return;
     }
+
     if (!best_device) {
       best_device = sub_device;
     }
     else {
+      /* Prefer a device that can use graphics interop for faster display update. */
+      if (sub_device->should_use_graphics_interop() &&
+          !best_device->should_use_graphics_interop()) {
+        best_device = sub_device;
+      }
+
       /* TODO(sergey): Choose fastest device from available ones. Taking into account performance
        * of the device and data transfer cost. */
     }
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index 05318b7545b..ab056e953c2 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -191,6 +191,12 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
      * had the computation done. */
     if (pass_info.num_components == 3) {
       get_pass_float3(render_buffers, buffer_params, destination);
+
+      /* Use alpha for colors passes. */
+      if (type == PASS_DIFFUSE_COLOR || type == PASS_GLOSSY_COLOR ||
+          type == PASS_TRANSMISSION_COLOR) {
+        num_written_components = destination.num_components;
+      }
     }
     else if (pass_info.num_components == 4) {
       if (destination.num_components == 3) {
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 9ad1c465725..3ec7b601d9f 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -26,6 +26,7 @@ PathTrace::PathTrace(Device *device,
                      RenderScheduler &render_scheduler,
                      TileManager &tile_manager)
     : device_(device),
+      film_(film),
       device_scene_(device_scene),
       render_scheduler_(render_scheduler),
       tile_manager_(tile_manager)
@@ -60,7 +61,17 @@ PathTrace::~PathTrace()
 void PathTrace::load_kernels()
 {
   if (denoiser_) {
+    /* Activate graphics interop while denoiser device is created, so that it can choose a device
+     * that supports interop for faster display updates. */
+    if (display_ && path_trace_works_.size() > 1) {
+      display_->graphics_interop_activate();
+    }
+
     denoiser_->load_kernels(progress_);
+
+    if (display_ && path_trace_works_.size() > 1) {
+      display_->graphics_interop_deactivate();
+    }
   }
 }

@@ -373,7 +384,7 @@ void PathTrace::path_trace(RenderWork &render_work)
     work_balance_infos_[i].time_spent += work_time;
     work_balance_infos_[i].occupancy = statistics.occupancy;

-    VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+    VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
               << work_time / num_samples
               << " seconds per sample), occupancy: " << statistics.occupancy;
   });
@@ -506,28 +517,30 @@ void PathTrace::denoise(const RenderWork &render_work)
   const double start_time = time_dt();

   RenderBuffers *buffer_to_denoise = nullptr;
-
-  unique_ptr<RenderBuffers> multi_device_buffers;
   bool allow_inplace_modification = false;

-  if (path_trace_works_.size() == 1) {
-    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  Device *denoiser_device = denoiser_->get_denoiser_device();
+  if (path_trace_works_.size() > 1 && denoiser_device && !big_tile_denoise_work_) {
+    big_tile_denoise_work_ = PathTraceWork::create(denoiser_device, film_, device_scene_, nullptr);
   }
-  else {
-    Device *denoiser_device = denoiser_->get_denoiser_device();
-    if (!denoiser_device) {
-      return;
-    }

-    multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
-    multi_device_buffers->reset(render_state_.effective_big_tile_params);
+  if (big_tile_denoise_work_) {
+    big_tile_denoise_work_->set_effective_buffer_params(render_state_.effective_big_tile_params,
+                                                        render_state_.effective_big_tile_params,
+                                                        render_state_.effective_big_tile_params);

-    buffer_to_denoise = multi_device_buffers.get();
+    buffer_to_denoise = big_tile_denoise_work_->get_render_buffers();
+    buffer_to_denoise->reset(render_state_.effective_big_tile_params);

-    copy_to_render_buffers(multi_device_buffers.get());
+    copy_to_render_buffers(buffer_to_denoise);

     allow_inplace_modification = true;
   }
+  else {
+    DCHECK_EQ(path_trace_works_.size(), 1);
+
+    buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+  }

   if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
                                 buffer_to_denoise,
@@ -536,14 +549,6 @@ void PathTrace::denoise(const RenderWork &render_work)
     render_state_.has_denoised_result = true;
   }

-  if (multi_device_buffers) {
-    multi_device_buffers->copy_from_device();
-    parallel_for_each(
-        path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
-          path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
-        });
-  }
-
   render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
 }

@@ -635,8 +640,13 @@ void PathTrace::update_display(const RenderWork &render_work)
   /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
    * all works in parallel. */
   const int num_samples = get_num_samples_in_buffer();
-  for (auto &&path_trace_work : path_trace_works_) {
-    path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    big_tile_denoise_work_->copy_to_display(display_.get(), pass_mode, num_samples);
+  }
+  else {
+    for (auto &&path_trace_work : path_trace_works_) {
+      path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples);
+    }
   }

   display_->update_end();
@@ -721,11 +731,10 @@ void PathTrace::write_tile_buffer(const RenderWork &render_work)
     VLOG_WORK << "Write tile result via buffer write callback.";
     tile_buffer_write();
   }
-
   /* Write tile to disk, so that the render work's render buffer can be re-used for the next
    * tile. */
-  if (has_multiple_tiles) {
-    VLOG_WORK << "Write tile result into .";
+  else {
+    VLOG_WORK << "Write tile result to disk.";
     tile_buffer_write_to_disk();
   }
 }
@@ -901,6 +910,10 @@ bool PathTrace::copy_render_tile_from_device()
     return true;
   }

+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    return big_tile_denoise_work_->copy_render_buffers_from_device();
+  }
+
   bool success = true;

   parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1002,6 +1015,10 @@ bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
     return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
   }

+  if (big_tile_denoise_work_ && render_state_.has_denoised_result) {
+    return big_tile_denoise_work_->get_render_tile_pixels(pass_accessor, destination);
+  }
+
   bool success = true;

   parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
@@ -1082,6 +1099,10 @@ void PathTrace::destroy_gpu_resources()
     for (auto &&path_trace_work : path_trace_works_) {
       path_trace_work->destroy_gpu_resources(display_.get());
     }
+
+    if (big_tile_denoise_work_) {
+      big_tile_denoise_work_->destroy_gpu_resources(display_.get());
+    }
   }
 }

@@ -1103,6 +1124,8 @@ static const char *device_type_for_description(const DeviceType type)
       return "OptiX";
     case DEVICE_HIP:
       return "HIP";
+    case DEVICE_ONEAPI:
+      return "oneAPI";
     case DEVICE_DUMMY:
       return "Dummy";
     case DEVICE_MULTI:
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index a470a6e1402..59382b51d23 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -236,6 +236,7 @@ class PathTrace {
   /* CPU device for creating temporary render buffers on the CPU side. */
   unique_ptr<Device> cpu_device_;

+  Film *film_;
   DeviceScene *device_scene_;

   RenderScheduler &render_scheduler_;
@@ -261,6 +262,9 @@ class PathTrace {
   /* Denoiser which takes care of denoising the big tile. */
   unique_ptr<Denoiser> denoiser_;

+  /* Denoiser device descriptor which holds the denoised big tile for multi-device workloads. */
+  unique_ptr<PathTraceWork> big_tile_denoise_work_;
+
   /* State which is common for all the steps of the render work.
    * Is brought up to date in the `render()` call and is accessed from all the steps involved into
    * rendering the work. */
diff --git a/intern/cycles/integrator/path_trace_tile.cpp b/intern/cycles/integrator/path_trace_tile.cpp
index 2f1f4e810a3..dfe88695013 100644
--- a/intern/cycles/integrator/path_trace_tile.cpp
+++ b/intern/cycles/integrator/path_trace_tile.cpp
@@ -33,7 +33,7 @@ bool PathTraceTile::get_pass_pixels(const string_view pass_name,
   if (!copied_from_device_) {
     /* Copy from device on demand. */
     path_trace_.copy_render_tile_from_device();
-    const_cast<PathTraceTile *>(this)->copied_from_device_ = true;
+    copied_from_device_ = true;
   }

   const BufferParams &buffer_params = path_trace_.get_render_tile_params();
diff --git a/intern/cycles/integrator/path_trace_tile.h b/intern/cycles/integrator/path_trace_tile.h
index 99ae08d04d1..223fa96e113 100644
--- a/intern/cycles/integrator/path_trace_tile.h
+++ b/intern/cycles/integrator/path_trace_tile.h
@@ -24,7 +24,7 @@ class PathTraceTile : public OutputDriver::Tile {
  private:
   PathTrace &path_trace_;

-  bool copied_from_device_;
+  mutable bool copied_from_device_;
 };

 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 0acaeace4b0..ee250a6916b 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -181,27 +181,49 @@ void PathTraceWorkGPU::alloc_integrator_queue()

 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
-  /* Allocate arrays for shader sorting. */
-  const int max_shaders = device_scene_->data.max_shaders;
-  if (integrator_shader_sort_counter_.size() < max_shaders) {
-    integrator_shader_sort_counter_.alloc(max_shaders);
-    integrator_shader_sort_counter_.zero_to_device();
+  /* Compute sort partitions, to balance between memory locality and coherence.
+   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+   */
+  num_sort_partitions_ = 1;
+  if (device_scene_->data.max_shaders < 300) {
+    const int num_elements = queue_->num_sort_partition_elements();
+    if (num_elements) {
+      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
+    }
+  }

-    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
-    integrator_shader_raytrace_sort_counter_.zero_to_device();
+  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+                                                                num_sort_partitions_);

-    integrator_shader_mnee_sort_counter_.alloc(max_shaders);
-    integrator_shader_mnee_sort_counter_.zero_to_device();
+  /* Allocate arrays for shader sorting. */
+  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+  if (integrator_shader_sort_counter_.size() < sort_buckets) {
+    integrator_shader_sort_counter_.alloc(sort_buckets);
+    integrator_shader_sort_counter_.zero_to_device();
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+        (int *)integrator_shader_sort_counter_.device_pointer;

-    integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
     integrator_shader_sort_prefix_sum_.zero_to_device();
+  }

-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
-        (int *)integrator_shader_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-        (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-        (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
+      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+      integrator_shader_raytrace_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+    }
+  }
+
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
+      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+      integrator_shader_mnee_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+    }
   }
 }

@@ -239,7 +261,7 @@ void PathTraceWorkGPU::init_execution()

   /* Copy to device side struct in constant memory. */
   device_->const_copy_to(
-      "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+      "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
 }

 void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
@@ -333,8 +355,12 @@ void PathTraceWorkGPU::enqueue_reset()
   queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
   queue_->zero_to_device(integrator_queue_counter_);
   queue_->zero_to_device(integrator_shader_sort_counter_);
-  queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+  }
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  }

   /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
    * counter on the host side because `zero_to_device()` is not doing it. */
@@ -486,9 +512,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
-    int max_shaders = device_scene_->data.max_shaders;
+    int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;

-    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);

     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 4c10a221a30..a805258d1b5 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork {
   bool interop_use_checked_ = false;
   bool interop_use_ = false;

+  /* Number of partitions to sort state indices into prior to material sort. */
+  int num_sort_partitions_;
+
   /* Maximum number of concurrent integrator states. */
   int max_num_paths_;
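Illustrative note (not part of the commit): the sorting change above sizes the shader sort counters as max_shaders * num_sort_partitions_ buckets and hands sort_partition_divisor to the kernel via integrator_state_gpu_. The self-contained C++ sketch below walks through that bucketing arithmetic; the SortPartitionSketch type and the sort_bucket_for_path() key formula are assumptions made for this sketch, not code taken from the diff.

// Standalone sketch of partitioned shader-sort bucketing (assumed helper names).
#include <algorithm>
#include <cstdio>

struct SortPartitionSketch {
  int max_shaders = 0;
  int max_num_paths = 0;
  int num_sort_partitions = 1;
  int sort_partition_divisor = 0;

  SortPartitionSketch(int shaders, int paths, int partition_elements)
      : max_shaders(shaders), max_num_paths(paths)
  {
    /* Same heuristic as alloc_integrator_sorting(): skip partitioning for large shader counts. */
    if (max_shaders < 300 && partition_elements > 0) {
      num_sort_partitions = std::max(max_num_paths / partition_elements, 1);
    }
    /* divide_up(max_num_paths, num_sort_partitions) */
    sort_partition_divisor = (max_num_paths + num_sort_partitions - 1) / num_sort_partitions;
  }

  /* One counter bucket per (partition, shader) pair, matching the array sizes in the diff. */
  int num_sort_buckets() const
  {
    return max_shaders * num_sort_partitions;
  }

  /* Hypothetical kernel-side key: group paths by partition first, then by shader, so that
   * paths which are close together in the state array stay close together after sorting. */
  int sort_bucket_for_path(int path_index, int shader_id) const
  {
    return (path_index / sort_partition_divisor) * max_shaders + shader_id;
  }
};

int main()
{
  /* Example numbers: 64 shaders, 1M paths, 64K elements per partition -> 16 partitions. */
  SortPartitionSketch sketch(64, 1 << 20, 64 * 1024);
  std::printf("partitions=%d divisor=%d buckets=%d\n",
              sketch.num_sort_partitions,
              sketch.sort_partition_divisor,
              sketch.num_sort_buckets());
  std::printf("path 70000 with shader 5 -> bucket %d\n", sketch.sort_bucket_for_path(70000, 5));
  return 0;
}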