7 files changed, 41 insertions, 24 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index b9784f68f56..aff21ef59bb 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -257,7 +257,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
    * become busy after adding new tiles). This is especially important for the shadow catcher which
    * schedules work in halves of available number of paths. */
   work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
-
+  work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
+                                          0);
   work_tile_scheduler_.reset(effective_buffer_params_,
                              start_sample,
                              samples_num,
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
index 4a1558cce09..e9a3cbd38aa 100644
--- a/intern/cycles/integrator/tile.cpp
+++ b/intern/cycles/integrator/tile.cpp
@@ -46,7 +46,8 @@ ccl_device_inline uint round_up_to_power_of_two(uint x)
   return next_power_of_two(x);
 }
 
-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                   const int num_samples,
                                   const int max_num_path_states,
                                   const float scrambling_distance)
@@ -73,7 +74,7 @@ TileSize tile_calculate_best_size(const int2 &image_size,
 
   TileSize tile_size;
   const int num_path_states_per_sample = max_num_path_states / num_samples;
-  if (scrambling_distance < 0.9f) {
+  if (scrambling_distance < 0.9f && accel_rt) {
     /* Prefer large tiles for scrambling distance, bounded by max num path states. */
     tile_size.width = min(image_size.x, max_num_path_states);
     tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1));
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
index 61f7d736115..05b1e0af6b1 100644
--- a/intern/cycles/integrator/tile.h
+++ b/intern/cycles/integrator/tile.h
@@ -49,7 +49,8 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
  * of active path states.
  * Will attempt to provide best guess to keep path tracing threads of a device as localized as
  * possible, and have as many threads active for every tile as possible. */
-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                   const int num_samples,
                                   const int max_num_path_states,
                                   const float scrambling_distance);
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
index 2d1ac07db7f..cac573dfeda 100644
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -28,6 +28,11 @@ WorkTileScheduler::WorkTileScheduler()
 {
 }
 
+void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt)
+{
+  accelerated_rt_ = accelerated_rt; /* Read back by reset_scheduler_state() for tile sizing. */
+}
+
 void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 {
   max_num_path_states_ = max_num_path_states;
@@ -59,7 +64,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
 void WorkTileScheduler::reset_scheduler_state()
 {
   tile_size_ = tile_calculate_best_size(
-      image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
+      accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
 
   VLOG(3) << "Will schedule tiles of size " << tile_size_;
 
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
index d9fa7e84431..8aa2f8e90bd 100644
--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -31,6 +31,9 @@ class WorkTileScheduler {
  public:
   WorkTileScheduler();
 
+  /* Set whether accelerated ray tracing is available; feeds into the tile size calculation. */
+  void set_accelerated_rt(bool accelerated_rt);
+
   /* MAximum path states which are allowed to be used by a single scheduled work tile.
    *
    * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
@@ -54,6 +57,9 @@ class WorkTileScheduler {
  protected:
   void reset_scheduler_state();
 
+  /* Whether accelerated ray tracing is available; forwarded to tile_calculate_best_size(). */
+  bool accelerated_rt_ = false;
+
   /* Maximum allowed path states to be used.
    *
    * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h
index fab0915c38e..c3083948057 100644
--- a/intern/cycles/kernel/device/gpu/work_stealing.h
+++ b/intern/cycles/kernel/device/gpu/work_stealing.h
@@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                       ccl_private uint *y,
                                       ccl_private uint *sample)
 {
-#if 0
-  /* Keep threads for the same sample together. */
-  uint tile_pixels = tile->w * tile->h;
-  uint sample_offset = global_work_index / tile_pixels;
-  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#else
-  /* Keeping threads for the same pixel together.
-   * Appears to improve performance by a few % on CUDA and OptiX. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#endif
+  uint sample_offset, pixel_offset;
+
+  if (kernel_data.integrator.scrambling_distance < 0.9f) {
+    /* Keep threads for the same sample together. */
+    uint tile_pixels = tile->w * tile->h;
+    sample_offset = global_work_index / tile_pixels;
+    pixel_offset = global_work_index - sample_offset * tile_pixels;
+  }
+  else {
+    /* Keeping threads for the same pixel together.
+     * Appears to improve performance by a few % on CUDA and OptiX. */
+    sample_offset = global_work_index % tile->num_samples;
+    pixel_offset = global_work_index / tile->num_samples;
+  }
 
   uint y_offset = pixel_offset / tile->w;
   uint x_offset = pixel_offset - y_offset * tile->w;
diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp
index 8bb0856d6a9..822c34c36bf 100644
--- a/intern/cycles/test/integrator_tile_test.cpp
+++ b/intern/cycles/test/integrator_tile_test.cpp
@@ -24,26 +24,26 @@ CCL_NAMESPACE_BEGIN
 TEST(tile_calculate_best_size, Basic)
 {
   /* Make sure CPU-like case is handled properly. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
 
   /* Enough path states to fit an entire image with all samples. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
             TileSize(1920, 1080, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
             TileSize(1920, 1080, 100));
 }
 
 TEST(tile_calculate_best_size, Extreme)
 {
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 262144, 131072, 1.0f),
             TileSize(1, 1, 512));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 1048576, 131072, 1.0f),
             TileSize(1, 1, 1024));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 10485760, 131072, 1.0f),
             TileSize(1, 1, 4096));
 
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
             TileSize(1, 1, 1024));
 }