From 07ec0effb61e18a3d2f1bad97ebf7f6cb5bb6b87 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Thu, 21 Sep 2017 03:37:22 +0200
Subject: Code cleanup: simplify kernel side work stealing code.

---
 intern/cycles/kernel/kernel_work_stealing.h | 112 ++++++++++------------------
 1 file changed, 38 insertions(+), 74 deletions(-)

(limited to 'intern/cycles/kernel/kernel_work_stealing.h')

diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 28fc5ce1c30..0c11158e8da 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
-{
-	return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
-}
-
-ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
-{
-	return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
-{
-	return ray_index / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
-{
-	uint total_work_size = kernel_total_work_size(kg);
-	uint num_pools = kernel_num_work_pools(kg);
-
-	if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
-		return 0;
-	}
-
-	uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
-
-	uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
-	if(work_pool < remainder / WORK_POOL_SIZE) {
-		work_size += WORK_POOL_SIZE;
-	}
-	else if(work_pool == remainder / WORK_POOL_SIZE) {
-		work_size += remainder % WORK_POOL_SIZE;
-	}
-
-	return work_size;
-}
-
-ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
-{
-	uint num_pools = kernel_num_work_pools(kg);
-	uint pool = work_pool_from_ray_index(kg, ray_index);
-
-	return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
-	       + (pool * WORK_POOL_SIZE)
-	       + (work_index % WORK_POOL_SIZE);
-}
-
 /* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              uint thread_index,  /* Selects the work pool (thread_index / WORK_POOL_SIZE). */
+                              ccl_private uint *global_work_index)  /* Output; not written when the early-out below is taken. */
 {
-	uint work_pool = work_pool_from_ray_index(kg, ray_index);
-	uint pool_size = work_pool_work_size(kg, work_pool);
+	uint total_work_size = kernel_split_params.w
+	                     * kernel_split_params.h
+	                     * kernel_split_params.num_samples;
 
-	if(pool_size == 0) {
+	/* With a small amount of work there may be more threads than work due to
+	 * rounding up of global size, stop such threads immediately. */
+	if(thread_index >= total_work_size) {
 		return false;
 	}
 
-	*work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
-	return (*work_index < pool_size);
-}
+	/* Fetch-and-increment the work counter of this thread's pool (WORK_POOL_SIZE consecutive threads share one). */
+	uint pool = thread_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
 
-/* This function assumes that the passed `work` is valid. */
-/* Decode sample number w.r.t. assigned `work`. */
-ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
-{
-	return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
-}
+	/* Map per-pool work index to a global work index; pools interleave in chunks of WORK_POOL_SIZE. */
+	uint global_size = ccl_global_size(0) * ccl_global_size(1);
+	kernel_assert(global_size % WORK_POOL_SIZE == 0);
+	kernel_assert(thread_index < global_size);
 
-/* Decode pixel and tile position w.r.t. assigned `work`. */
-ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
-                             ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint work_index,
-                             uint ray_index)
-{
-	uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size  /* Skip whole rounds of global_size items. */
+	                   + (pool * WORK_POOL_SIZE)  /* Start of this pool's chunk within a round. */
+	                   + (work_index % WORK_POOL_SIZE);  /* Offset inside the chunk. */
 
-	*tile_x = pixel_index % kernel_split_params.w;
-	*tile_y = pixel_index / kernel_split_params.w;
+	/* There is work only while the global index is below the total work size. */
+	return (*global_work_index < total_work_size);
+}
 
-	*pixel_x = *tile_x + kernel_split_params.x;
-	*pixel_y = *tile_y + kernel_split_params.y;
+/* Map global work index to pixel X/Y and sample; the index decodes as (sample_offset * h + y_offset) * w + x_offset. */
+ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
+{
+	uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+	uint sample_offset = global_work_index / tile_pixels;
+	uint pixel_offset = global_work_index - sample_offset * tile_pixels;  /* == global_work_index % tile_pixels */
+	uint y_offset = pixel_offset / kernel_split_params.w;
+	uint x_offset = pixel_offset - y_offset * kernel_split_params.w;  /* == pixel_offset % kernel_split_params.w */
+
+	*x = kernel_split_params.x + x_offset;
+	*y = kernel_split_params.y + y_offset;
+	*sample = kernel_split_params.start_sample + sample_offset;
 }
 
 CCL_NAMESPACE_END
-- 
cgit v1.2.3