Diffstat (limited to 'intern/cycles/kernel/kernel_work_stealing.h')
-rw-r--r--   intern/cycles/kernel/kernel_work_stealing.h   87
1 file changed, 11 insertions(+), 76 deletions(-)
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
 
 CCL_NAMESPACE_BEGIN
 
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
  */
 
 /* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                       uint global_work_index,
                                       ccl_private uint *x,
                                       ccl_private uint *y,
                                       ccl_private uint *sample)
 {
-#ifdef __KERNEL_CUDA__
-  /* Keeping threads for the same pixel together improves performance on CUDA. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+  /* Keep threads for the same sample together. */
   uint tile_pixels = tile->w * tile->h;
   uint sample_offset = global_work_index / tile_pixels;
   uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+  /* Keeping threads for the same pixel together.
+   * Appears to improve performance by a few % on CUDA and OptiX. */
+  uint sample_offset = global_work_index % tile->num_samples;
+  uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
   uint y_offset = pixel_offset / tile->w;
   uint x_offset = pixel_offset - y_offset * tile->w;
 
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
   *sample = tile->start_sample + sample_offset;
 }
 
-#ifdef __KERNEL_OPENCL__
-#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
-                                   ccl_global uint *work_pools,
-                                   uint total_work_size,
-                                   uint ray_index,
-                                   ccl_private uint *global_work_index)
-{
-  /* With a small amount of work there may be more threads than work due to
-   * rounding up of global size, stop such threads immediately. */
-  if (ray_index >= total_work_size) {
-    return false;
-  }
-
-  /* Increase atomic work index counter in pool. */
-  uint pool = ray_index / WORK_POOL_SIZE;
-  uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
-  /* Map per-pool work index to a global work index. */
-  uint global_size = ccl_global_size(0) * ccl_global_size(1);
-  kernel_assert(global_size % WORK_POOL_SIZE == 0);
-  kernel_assert(ray_index < global_size);
-
-  *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
-                       (work_index % WORK_POOL_SIZE);
-
-  /* Test if all work for this pool is done. */
-  return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
-                              ccl_global uint *work_pools,
-                              uint total_work_size,
-                              uint ray_index,
-                              ccl_private uint *global_work_index)
-{
-  bool got_work = false;
-  if (kernel_data.film.pass_adaptive_aux_buffer) {
-    do {
-      got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
-      if (got_work) {
-        ccl_global WorkTile *tile = &kernel_split_params.tile;
-        uint x, y, sample;
-        get_work_pixel(tile, *global_work_index, &x, &y, &sample);
-        uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
-        ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-        ccl_global float4 *aux = (ccl_global float4 *)(buffer +
-                                                       kernel_data.film.pass_adaptive_aux_buffer);
-        if ((*aux).w == 0.0f) {
-          break;
-        }
-      }
-    } while (got_work);
-  }
-  else {
-    got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
-  }
-  return got_work;
-}
-#endif
-
 CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */
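For illustration, here is a minimal host-side C++ sketch of the two index mappings that get_work_pixel() chooses between. The WorkTile struct below is a trimmed stand-in for the kernel's KernelWorkTile (only the fields the diff uses), and the standalone functions are not part of Cycles:

/* Host-side sketch of the two mappings in get_work_pixel(). */
#include <cstdint>
#include <cstdio>

struct WorkTile {
  uint32_t x, y, w, h;   /* Tile origin and size in pixels. */
  uint32_t start_sample; /* First sample index for this tile. */
  uint32_t num_samples;  /* Samples per pixel. */
};

/* Pixel-major (the branch the patch keeps): consecutive work indices map to
 * the same pixel with increasing sample numbers. */
static void map_pixel_major(const WorkTile &tile, uint32_t i, uint32_t *x,
                            uint32_t *y, uint32_t *sample)
{
  uint32_t sample_offset = i % tile.num_samples;
  uint32_t pixel_offset = i / tile.num_samples;
  uint32_t y_offset = pixel_offset / tile.w;
  uint32_t x_offset = pixel_offset - y_offset * tile.w;
  *x = tile.x + x_offset;
  *y = tile.y + y_offset;
  *sample = tile.start_sample + sample_offset;
}

/* Sample-major (the disabled "#if 0" branch): consecutive work indices map
 * to neighboring pixels at the same sample number. */
static void map_sample_major(const WorkTile &tile, uint32_t i, uint32_t *x,
                             uint32_t *y, uint32_t *sample)
{
  uint32_t tile_pixels = tile.w * tile.h;
  uint32_t sample_offset = i / tile_pixels;
  uint32_t pixel_offset = i - sample_offset * tile_pixels;
  uint32_t y_offset = pixel_offset / tile.w;
  uint32_t x_offset = pixel_offset - y_offset * tile.w;
  *x = tile.x + x_offset;
  *y = tile.y + y_offset;
  *sample = tile.start_sample + sample_offset;
}

int main()
{
  WorkTile tile = {0, 0, 4, 2, 0, 8}; /* 4x2 tile, 8 samples per pixel. */
  for (uint32_t i = 0; i < 4; i++) {
    uint32_t x, y, s;
    map_pixel_major(tile, i, &x, &y, &s);
    printf("pixel-major  i=%u -> x=%u y=%u sample=%u\n", i, x, y, s);
    map_sample_major(tile, i, &x, &y, &s);
    printf("sample-major i=%u -> x=%u y=%u sample=%u\n", i, x, y, s);
  }
  return 0;
}

With this tile, pixel-major sends work indices 0..3 all to pixel (0,0) with samples 0..3, while sample-major walks pixels (0,0)..(3,0) at sample 0. Keeping adjacent threads on the same pixel is what, per the comment added in the patch, gains a few % on CUDA and OptiX.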
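For reference, the per-pool counter scheme deleted from the split kernel can be exercised on the host like this. In this sketch std::atomic stands in for the kernel's atomic_fetch_and_inc_uint32, explicit global_size and pool parameters replace KernelGlobals, and the thread indices in main() are arbitrary:

/* Host-side sketch of the removed get_next_work_item() pool indexing. */
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

static const uint32_t WORK_POOL_SIZE = 64;

/* Returns true if thread ray_index claimed a work item, storing its global
 * index in *global_work_index. As in the original kernel_assert, global_size
 * must be a multiple of WORK_POOL_SIZE. */
static bool get_next_work_item(std::vector<std::atomic<uint32_t>> &work_pools,
                               uint32_t total_work_size, uint32_t global_size,
                               uint32_t ray_index, uint32_t *global_work_index)
{
  /* More threads than work is possible due to rounded-up global size. */
  if (ray_index >= total_work_size) {
    return false;
  }

  /* Increase the atomic work index counter of this thread's pool. */
  uint32_t pool = ray_index / WORK_POOL_SIZE;
  uint32_t work_index = work_pools[pool].fetch_add(1);

  /* Map the per-pool index to a global one: each pool owns every
   * WORK_POOL_SIZE-wide slice at its own offset within the grid. */
  *global_work_index = (work_index / WORK_POOL_SIZE) * global_size +
                       pool * WORK_POOL_SIZE + (work_index % WORK_POOL_SIZE);
  return *global_work_index < total_work_size;
}

int main()
{
  const uint32_t global_size = 128; /* Two pools of 64 threads each. */
  const uint32_t total_work = 300;
  std::vector<std::atomic<uint32_t>> pools(global_size / WORK_POOL_SIZE);
  uint32_t index;
  /* Threads 0 (pool 0) and 70 (pool 1) each claim two items. */
  for (uint32_t ray_index : {0u, 0u, 70u, 70u}) {
    if (get_next_work_item(pools, total_work, global_size, ray_index, &index))
      printf("thread %u -> work %u\n", ray_index, index);
  }
  return 0;
}

Here thread 0 claims global items 0 and 1 while thread 70 claims 64 and 65: contention is limited to one counter per 64 threads, yet the pools cover the whole work range in interleaved slices.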
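Finally, a sketch of the convergence filter in the removed get_next_work(): when the adaptive sampling aux pass is present, work items landing on pixels already marked converged (aux.w != 0) are skipped. The Aux struct, the sequential counter, and the one-work-item-per-pixel simplification are stand-ins for the real film buffer and pool logic above:

/* Host-side sketch of the adaptive-sampling skip in get_next_work(). */
#include <cstdint>
#include <cstdio>
#include <vector>

struct Aux {
  float x, y, z, w; /* w != 0 marks the pixel as converged. */
};

/* Hand out work indices 0..total-1 in order; a stand-in for the atomic
 * per-pool counter of get_next_work_item(). */
static bool next_work_item(uint32_t *counter, uint32_t total, uint32_t *index)
{
  if (*counter >= total) {
    return false;
  }
  *index = (*counter)++;
  return true;
}

/* Keep pulling work until an item lands on a pixel that still needs samples,
 * as the removed loop did when pass_adaptive_aux_buffer was set. One work
 * item per pixel here; the real code maps indices through get_work_pixel(). */
static bool get_next_work(const std::vector<Aux> &aux, uint32_t *counter,
                          uint32_t *index)
{
  while (next_work_item(counter, (uint32_t)aux.size(), index)) {
    if (aux[*index].w == 0.0f) {
      return true; /* Pixel not yet converged, render it. */
    }
  }
  return false; /* All remaining work fell on converged pixels. */
}

int main()
{
  /* Pixels 1 and 2 already converged. */
  std::vector<Aux> aux = {
      {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}};
  uint32_t counter = 0, index;
  while (get_next_work(aux, &counter, &index)) {
    printf("render work item %u\n", index); /* Prints items 0 and 3. */
  }
  return 0;
}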