Diffstat (limited to 'intern/cycles/kernel/kernel_work_stealing.h')
-rw-r--r--   intern/cycles/kernel/kernel_work_stealing.h   195
1 file changed, 42 insertions(+), 153 deletions(-)
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa31..0c2d9379b63 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -17,177 +17,66 @@
 #ifndef __KERNEL_WORK_STEALING_H__
 #define __KERNEL_WORK_STEALING_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Utility functions for work stealing
  */
 
-#ifdef __WORK_STEALING__
-
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 uint parallel_samples,
-                                 int dim)
+#ifdef __SPLIT_KERNEL__
+/* Returns true if there is work */
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              ccl_global uint *work_pools,
+                              uint total_work_size,
+                              uint ray_index,
+                              ccl_private uint *global_work_index)
 {
-	if(dim == 0) {
-		uint x_span = ray_index % (tile_dim_x * parallel_samples);
-		return x_span / get_local_size(0);
+	/* With a small amount of work there may be more threads than work due to
+	 * rounding up of global size, stop such threads immediately. */
+	if(ray_index >= total_work_size) {
+		return false;
 	}
-	else /*if(dim == 1)*/ {
-		kernel_assert(dim == 1);
-		uint y_span = ray_index / (tile_dim_x * parallel_samples);
-		return y_span / get_local_size(1);
-	}
-}
-
-uint get_total_work(uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
-{
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return threads_within_tile_border_x *
-	       threads_within_tile_border_y *
-	       num_samples;
-}
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint parallel_samples,
-                  uint ray_index)
-{
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint total_work = get_total_work(tile_dim_x,
-	                                 tile_dim_y,
-	                                 grp_idx,
-	                                 grp_idy,
-	                                 num_samples);
-	uint group_index = grp_idy * get_num_groups(0) + grp_idx;
-	*my_work = atomic_inc(&work_pool[group_index]);
-	return (*my_work < total_work) ? 1 : 0;
-}
+	/* Increase atomic work index counter in pool. */
+	uint pool = ray_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint parallel_samples,
-                   uint ray_index)
-{
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
+	/* Map per-pool work index to a global work index. */
+	uint global_size = ccl_global_size(0) * ccl_global_size(1);
+	kernel_assert(global_size % WORK_POOL_SIZE == 0);
+	kernel_assert(ray_index < global_size);
 
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
+	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+	                   + (pool * WORK_POOL_SIZE)
+	                   + (work_index % WORK_POOL_SIZE);
 
-	return my_work /
-	       (threads_within_tile_border_x * threads_within_tile_border_y);
+	/* Test if all work for this pool is done. */
+	return (*global_work_index < total_work_size);
 }
+#endif
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
-                             uint parallel_samples,
-                             uint ray_index)
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	uint total_associated_pixels =
-		threads_within_tile_border_x * threads_within_tile_border_y;
-	uint work_group_pixel_index = my_work % total_associated_pixels;
-	uint work_group_pixel_x =
-		work_group_pixel_index % threads_within_tile_border_x;
-	uint work_group_pixel_y =
-		work_group_pixel_index / threads_within_tile_border_x;
-
-	*pixel_x =
-		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
-	*pixel_y =
-		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
-	*tile_x = *pixel_x - tile_offset_x;
-	*tile_y = *pixel_y - tile_offset_y;
+	uint tile_pixels = tile->w * tile->h;
+	uint sample_offset = global_work_index / tile_pixels;
+	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+	uint y_offset = pixel_offset / tile->w;
+	uint x_offset = pixel_offset - y_offset * tile->w;
+
+	*x = tile->x + x_offset;
+	*y = tile->y + y_offset;
+	*sample = tile->start_sample + sample_offset;
}
 
-#endif /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
 
 #endif /* __KERNEL_WORK_STEALING_H__ */
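For context on the new scheme: the patched get_next_work() replaces the per-work-group counters with per-pool counters, where each pool serves WORK_POOL_SIZE consecutive threads, and the counter value is remapped so that a pool's work items stay inside its own WORK_POOL_SIZE-wide column of the global index space. Below is a minimal host-side C sketch of just that remapping, assuming an illustrative WORK_POOL_SIZE of 64 and a global size of 256; both values are hypothetical here (in the kernel they come from the launch configuration), and pool_to_global() is a stand-in name for the arithmetic inside get_next_work().

/* Host-side sketch (not part of the patch) of the per-pool to global
 * work index mapping used by the new get_next_work(). */
#include <stdio.h>

#define WORK_POOL_SIZE 64u  /* assumed value, for illustration only */

static unsigned pool_to_global(unsigned pool,
                               unsigned work_index,
                               unsigned global_size)
{
	/* Same arithmetic as the patch: stride whole "rows" of global_size,
	 * then offset into this pool's WORK_POOL_SIZE-wide slot. */
	return (work_index / WORK_POOL_SIZE) * global_size +
	       (pool * WORK_POOL_SIZE) +
	       (work_index % WORK_POOL_SIZE);
}

int main(void)
{
	const unsigned global_size = 256;  /* must be a multiple of WORK_POOL_SIZE */

	/* Pool 1's first WORK_POOL_SIZE items land at indices 64..127, then the
	 * next items jump ahead one "row" to 320..383, so each pool's work stays
	 * contiguous within its column of the global index space. */
	for(unsigned work_index = 0; work_index < 130; work_index += 63) {
		printf("pool 1, work_index %3u -> global %4u\n",
		       work_index, pool_to_global(1, work_index, global_size));
	}
	return 0;
}

One consequence of this layout, and plausibly the motivation for it, is that the single atomic counter per pool is shared by only WORK_POOL_SIZE threads, which limits contention, while neighboring threads in a pool still receive neighboring global work indices.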
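Similarly, the new get_work_pixel() decodes a global work index into (x, y, sample) by treating the index as sample-major over the tile's pixels. A hedged host-side sketch follows, using a stand-in Tile struct in place of Cycles' WorkTile (the field names mirror the patch, but the struct definition is assumed here); it checks that the decode inverts the natural encoding sample * w * h + y_offset * w + x_offset.

/* Host-side sketch (not part of the patch) of the get_work_pixel() decode. */
#include <assert.h>
#include <stdio.h>

typedef struct {
	unsigned x, y, w, h;   /* tile origin and size */
	unsigned start_sample;
} Tile;  /* stand-in for Cycles' WorkTile */

static void work_pixel(const Tile *tile, unsigned global_work_index,
                       unsigned *x, unsigned *y, unsigned *sample)
{
	/* Same arithmetic as the patched get_work_pixel(). */
	unsigned tile_pixels = tile->w * tile->h;
	unsigned sample_offset = global_work_index / tile_pixels;
	unsigned pixel_offset = global_work_index - sample_offset * tile_pixels;
	unsigned y_offset = pixel_offset / tile->w;
	unsigned x_offset = pixel_offset - y_offset * tile->w;

	*x = tile->x + x_offset;
	*y = tile->y + y_offset;
	*sample = tile->start_sample + sample_offset;
}

int main(void)
{
	Tile tile = { 128, 256, 64, 64, 10 };
	unsigned x, y, sample;

	/* Work item 0 is the tile's top-left pixel at the start sample... */
	work_pixel(&tile, 0, &x, &y, &sample);
	assert(x == 128 && y == 256 && sample == 10);

	/* ...and item w*h lands back on the same pixel, one sample later. */
	work_pixel(&tile, 64 * 64, &x, &y, &sample);
	assert(x == 128 && y == 256 && sample == 11);

	printf("decode ok\n");
	return 0;
}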