Diffstat (limited to 'intern/cycles/kernel/kernel_work_stealing.h')
-rw-r--r--   intern/cycles/kernel/kernel_work_stealing.h   87
1 file changed, 11 insertions(+), 76 deletions(-)
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
 
 CCL_NAMESPACE_BEGIN
 
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
  */
 
 /* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                       uint global_work_index,
                                       ccl_private uint *x,
                                       ccl_private uint *y,
                                       ccl_private uint *sample)
 {
-#ifdef __KERNEL_CUDA__
-  /* Keeping threads for the same pixel together improves performance on CUDA. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+  /* Keep threads for the same sample together. */
   uint tile_pixels = tile->w * tile->h;
   uint sample_offset = global_work_index / tile_pixels;
   uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+  /* Keeping threads for the same pixel together.
+   * Appears to improve performance by a few % on CUDA and OptiX. */
+  uint sample_offset = global_work_index % tile->num_samples;
+  uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
   uint y_offset = pixel_offset / tile->w;
   uint x_offset = pixel_offset - y_offset * tile->w;
 
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
   *sample = tile->start_sample + sample_offset;
 }
 
-#ifdef __KERNEL_OPENCL__
-#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
-                                   ccl_global uint *work_pools,
-                                   uint total_work_size,
-                                   uint ray_index,
-                                   ccl_private uint *global_work_index)
-{
-  /* With a small amount of work there may be more threads than work due to
-   * rounding up of global size, stop such threads immediately. */
-  if (ray_index >= total_work_size) {
-    return false;
-  }
-
-  /* Increase atomic work index counter in pool. */
-  uint pool = ray_index / WORK_POOL_SIZE;
-  uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
-  /* Map per-pool work index to a global work index. */
-  uint global_size = ccl_global_size(0) * ccl_global_size(1);
-  kernel_assert(global_size % WORK_POOL_SIZE == 0);
-  kernel_assert(ray_index < global_size);
-
-  *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
-                       (work_index % WORK_POOL_SIZE);
-
-  /* Test if all work for this pool is done. */
-  return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
-                              ccl_global uint *work_pools,
-                              uint total_work_size,
-                              uint ray_index,
-                              ccl_private uint *global_work_index)
-{
-  bool got_work = false;
-  if (kernel_data.film.pass_adaptive_aux_buffer) {
-    do {
-      got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
-      if (got_work) {
-        ccl_global WorkTile *tile = &kernel_split_params.tile;
-        uint x, y, sample;
-        get_work_pixel(tile, *global_work_index, &x, &y, &sample);
-        uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
-        ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-        ccl_global float4 *aux = (ccl_global float4 *)(buffer +
-                                                       kernel_data.film.pass_adaptive_aux_buffer);
-        if ((*aux).w == 0.0f) {
-          break;
-        }
-      }
-    } while (got_work);
-  }
-  else {
-    got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
-  }
-  return got_work;
-}
-#endif
-
 CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */
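For illustration, here is a minimal host-side C++ sketch of the two index mappings that get_work_pixel() chooses between. The WorkTile struct below is a trimmed stand-in for the kernel's KernelWorkTile (only the fields the diff uses), and the standalone functions are not part of Cycles:

/* Host-side sketch of the two mappings in get_work_pixel(). */
#include <cstdint>
#include <cstdio>

struct WorkTile {
  uint32_t x, y, w, h;   /* Tile origin and size in pixels. */
  uint32_t start_sample; /* First sample index for this tile. */
  uint32_t num_samples;  /* Samples per pixel. */
};

/* Pixel-major (the branch the patch keeps): consecutive work indices map to
 * the same pixel with increasing sample numbers. */
static void map_pixel_major(const WorkTile &tile, uint32_t i, uint32_t *x,
                            uint32_t *y, uint32_t *sample)
{
  uint32_t sample_offset = i % tile.num_samples;
  uint32_t pixel_offset = i / tile.num_samples;
  uint32_t y_offset = pixel_offset / tile.w;
  uint32_t x_offset = pixel_offset - y_offset * tile.w;
  *x = tile.x + x_offset;
  *y = tile.y + y_offset;
  *sample = tile.start_sample + sample_offset;
}

/* Sample-major (the disabled "#if 0" branch): consecutive work indices map
 * to neighboring pixels at the same sample number. */
static void map_sample_major(const WorkTile &tile, uint32_t i, uint32_t *x,
                             uint32_t *y, uint32_t *sample)
{
  uint32_t tile_pixels = tile.w * tile.h;
  uint32_t sample_offset = i / tile_pixels;
  uint32_t pixel_offset = i - sample_offset * tile_pixels;
  uint32_t y_offset = pixel_offset / tile.w;
  uint32_t x_offset = pixel_offset - y_offset * tile.w;
  *x = tile.x + x_offset;
  *y = tile.y + y_offset;
  *sample = tile.start_sample + sample_offset;
}

int main()
{
  WorkTile tile = {0, 0, 4, 2, 0, 8}; /* 4x2 tile, 8 samples per pixel. */
  for (uint32_t i = 0; i < 4; i++) {
    uint32_t x, y, s;
    map_pixel_major(tile, i, &x, &y, &s);
    printf("pixel-major  i=%u -> x=%u y=%u sample=%u\n", i, x, y, s);
    map_sample_major(tile, i, &x, &y, &s);
    printf("sample-major i=%u -> x=%u y=%u sample=%u\n", i, x, y, s);
  }
  return 0;
}

With this tile, pixel-major sends work indices 0..3 all to pixel (0,0) with samples 0..3, while sample-major walks pixels (0,0)..(3,0) at sample 0. Keeping adjacent threads on the same pixel is what, per the comment added in the patch, gains a few % on CUDA and OptiX.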
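For reference, the per-pool counter scheme deleted from the split kernel can be exercised on the host like this. In this sketch std::atomic stands in for the kernel's atomic_fetch_and_inc_uint32, explicit global_size and pool parameters replace KernelGlobals, and the thread indices in main() are arbitrary:

/* Host-side sketch of the removed get_next_work_item() pool indexing. */
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

static const uint32_t WORK_POOL_SIZE = 64;

/* Returns true if thread ray_index claimed a work item, storing its global
 * index in *global_work_index. As in the original kernel_assert, global_size
 * must be a multiple of WORK_POOL_SIZE. */
static bool get_next_work_item(std::vector<std::atomic<uint32_t>> &work_pools,
                               uint32_t total_work_size, uint32_t global_size,
                               uint32_t ray_index, uint32_t *global_work_index)
{
  /* More threads than work is possible due to rounded-up global size. */
  if (ray_index >= total_work_size) {
    return false;
  }

  /* Increase the atomic work index counter of this thread's pool. */
  uint32_t pool = ray_index / WORK_POOL_SIZE;
  uint32_t work_index = work_pools[pool].fetch_add(1);

  /* Map the per-pool index to a global one: each pool owns every
   * WORK_POOL_SIZE-wide slice at its own offset within the grid. */
  *global_work_index = (work_index / WORK_POOL_SIZE) * global_size +
                       pool * WORK_POOL_SIZE + (work_index % WORK_POOL_SIZE);
  return *global_work_index < total_work_size;
}

int main()
{
  const uint32_t global_size = 128; /* Two pools of 64 threads each. */
  const uint32_t total_work = 300;
  std::vector<std::atomic<uint32_t>> pools(global_size / WORK_POOL_SIZE);
  uint32_t index;
  /* Threads 0 (pool 0) and 70 (pool 1) each claim two items. */
  for (uint32_t ray_index : {0u, 0u, 70u, 70u}) {
    if (get_next_work_item(pools, total_work, global_size, ray_index, &index))
      printf("thread %u -> work %u\n", ray_index, index);
  }
  return 0;
}

Here thread 0 claims global items 0 and 1 while thread 70 claims 64 and 65: contention is limited to one counter per 64 threads, yet the pools cover the whole work range in interleaved slices.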
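Finally, a sketch of the convergence filter in the removed get_next_work(): when the adaptive sampling aux pass is present, work items landing on pixels already marked converged (aux.w != 0) are skipped. The Aux struct, the sequential counter, and the one-work-item-per-pixel simplification are stand-ins for the real film buffer and pool logic above:

/* Host-side sketch of the adaptive-sampling skip in get_next_work(). */
#include <cstdint>
#include <cstdio>
#include <vector>

struct Aux {
  float x, y, z, w; /* w != 0 marks the pixel as converged. */
};

/* Hand out work indices 0..total-1 in order; a stand-in for the atomic
 * per-pool counter of get_next_work_item(). */
static bool next_work_item(uint32_t *counter, uint32_t total, uint32_t *index)
{
  if (*counter >= total) {
    return false;
  }
  *index = (*counter)++;
  return true;
}

/* Keep pulling work until an item lands on a pixel that still needs samples,
 * as the removed loop did when pass_adaptive_aux_buffer was set. One work
 * item per pixel here; the real code maps indices through get_work_pixel(). */
static bool get_next_work(const std::vector<Aux> &aux, uint32_t *counter,
                          uint32_t *index)
{
  while (next_work_item(counter, (uint32_t)aux.size(), index)) {
    if (aux[*index].w == 0.0f) {
      return true; /* Pixel not yet converged, render it. */
    }
  }
  return false; /* All remaining work fell on converged pixels. */
}

int main()
{
  /* Pixels 1 and 2 already converged. */
  std::vector<Aux> aux = {
      {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}};
  uint32_t counter = 0, index;
  while (get_next_work(aux, &counter, &index)) {
    printf("render work item %u\n", index); /* Prints items 0 and 3. */
  }
  return 0;
}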