From 07ec0effb61e18a3d2f1bad97ebf7f6cb5bb6b87 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 21 Sep 2017 03:37:22 +0200 Subject: Code cleanup: simplify kernel side work stealing code. --- intern/cycles/kernel/kernel_work_stealing.h | 112 +++++++-------------- intern/cycles/kernel/split/kernel_buffer_update.h | 37 +++---- ..._holdout_emission_blurring_pathtermination_ao.h | 13 +-- intern/cycles/kernel/split/kernel_path_init.h | 27 ++--- .../cycles/kernel/split/kernel_split_data_types.h | 2 +- 5 files changed, 66 insertions(+), 125 deletions(-) (limited to 'intern') diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 28fc5ce1c30..0c11158e8da 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg) -{ - return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples; -} - -ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg) -{ - return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE; -} - -ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index) -{ - return ray_index / WORK_POOL_SIZE; -} - -ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool) -{ - uint total_work_size = kernel_total_work_size(kg); - uint num_pools = kernel_num_work_pools(kg); - - if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) { - return 0; - } - - uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE; - - uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE)); - if(work_pool < remainder / WORK_POOL_SIZE) { - work_size += WORK_POOL_SIZE; - } - else if(work_pool == remainder / WORK_POOL_SIZE) { - work_size += remainder % WORK_POOL_SIZE; - } - - return work_size; -} - -ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index) -{ - uint num_pools = kernel_num_work_pools(kg); - uint pool = work_pool_from_ray_index(kg, ray_index); - - return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE) - + (pool * WORK_POOL_SIZE) - + (work_index % WORK_POOL_SIZE); -} - /* Returns true if there is work */ -ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index) +ccl_device bool get_next_work(KernelGlobals *kg, + uint thread_index, + ccl_private uint *global_work_index) { - uint work_pool = work_pool_from_ray_index(kg, ray_index); - uint pool_size = work_pool_work_size(kg, work_pool); + uint total_work_size = kernel_split_params.w + * kernel_split_params.h + * kernel_split_params.num_samples; - if(pool_size == 0) { + /* With a small amount of work there may be more threads than work due to + * rounding up of global size, stop such threads immediately. */ + if(thread_index >= total_work_size) { return false; } - *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]); - return (*work_index < pool_size); -} + /* Increase atomic work index counter in pool. */ + uint pool = thread_index / WORK_POOL_SIZE; + uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]); -/* This function assumes that the passed `work` is valid. */ -/* Decode sample number w.r.t. assigned `work`. */ -ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index) -{ - return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h); -} + /* Map per-pool work index to a global work index. */ + uint global_size = ccl_global_size(0) * ccl_global_size(1); + kernel_assert(global_size % WORK_POOL_SIZE == 0); + kernel_assert(thread_index < global_size); -/* Decode pixel and tile position w.r.t. assigned `work`. */ -ccl_device void get_work_pixel_tile_position(KernelGlobals *kg, - ccl_private uint *pixel_x, - ccl_private uint *pixel_y, - ccl_private uint *tile_x, - ccl_private uint *tile_y, - uint work_index, - uint ray_index) -{ - uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h); + *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); - *tile_x = pixel_index % kernel_split_params.w; - *tile_y = pixel_index / kernel_split_params.w; + /* Test if all work for this pool is done. */ + return (*global_work_index < total_work_size); +} - *pixel_x = *tile_x + kernel_split_params.x; - *pixel_y = *tile_y + kernel_split_params.y; +/* Map global work index to pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(KernelGlobals *kg, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ + uint tile_pixels = kernel_split_params.w * kernel_split_params.h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; + uint y_offset = pixel_offset / kernel_split_params.w; + uint x_offset = pixel_offset - y_offset * kernel_split_params.w; + + *x = kernel_split_params.x + x_offset; + *y = kernel_split_params.y + y_offset; + *sample = kernel_split_params.start_sample + sample_offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 7b4d1299c12..c9e7deddafa 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -84,14 +84,9 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - uint work_index = kernel_split_state.work_array[ray_index]; - uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - - uint tile_x, tile_y, pixel_x, pixel_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); - - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + uint sample = state->sample; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; /* accumulate result in output buffer */ kernel_write_result(kg, buffer, sample, L); @@ -102,31 +97,26 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { /* We have completed current work; So get next work */ uint work_index; - int valid_work = get_next_work(kg, &work_index, ray_index); - if(!valid_work) { + if(!get_next_work(kg, ray_index, &work_index)) { /* If work is invalid, this means no more work is available and the thread may exit */ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - kernel_split_state.work_array[ray_index] = work_index; - /* Get the sample associated with the current work */ - uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - /* Get pixel and tile position associated with current work */ - uint tile_x, tile_y, pixel_x, pixel_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); - - /* Remap rng_state according to the current work */ + uint x, y, sample; + get_work_pixel(kg, work_index, &x, &y, &sample); + + /* Remap rng_state to current pixel. */ ccl_global uint *rng_state = kernel_split_params.rng_state; - rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + rng_state += kernel_split_params.offset + x + y*stride; - /* Remap buffer according to the current work */ - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; /* Initialize random numbers and ray. */ uint rng_hash; - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng_hash, ray); + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray); if(ray->t != 0.0f) { /* Initialize throughput, path radiance, Ray, PathState; @@ -145,6 +135,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; kernel_write_pass_float4(buffer, sample, L_rad); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 4d9e08becc4..dffd291012d 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -90,8 +90,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(ray_index != QUEUE_EMPTY_SLOT) { #endif - int stride = kernel_split_params.stride; - ccl_global PathState *state = 0x0; float3 throughput; @@ -99,15 +97,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( ShaderData *sd = &kernel_split_state.sd[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - uint work_index = kernel_split_state.work_array[ray_index]; - uint pixel_x, pixel_y, tile_x, tile_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h index c75931855b2..0ab2289348b 100644 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -29,38 +29,32 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { */ kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; - uint work_index = 0; /* Get work. */ - if(!get_next_work(kg, &work_index, ray_index)) { + uint work_index; + if(!get_next_work(kg, ray_index, &work_index)) { /* No more work, mark ray as inactive */ kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; return; } - /* Get the sample associated with the work. */ - uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - - /* Get pixel and tile position associated with the work. */ - uint pixel_x, pixel_y, tile_x, tile_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - kernel_split_state.work_array[ray_index] = work_index; + uint x, y, sample; + get_work_pixel(kg, work_index, &x, &y, &sample); + /* Remap rng_state and buffer to current pixel. */ ccl_global uint *rng_state = kernel_split_params.rng_state; - rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; + rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride; - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; /* Initialize random numbers and ray. */ uint rng_hash; kernel_path_trace_setup(kg, rng_state, sample, - pixel_x, pixel_y, + x, y, &rng_hash, &kernel_split_state.ray[ray_index]); @@ -84,6 +78,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; kernel_write_pass_float4(buffer, sample, L_rad); ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); } diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index e08afc22b20..c58c8463f5c 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -122,7 +122,7 @@ typedef ccl_global struct SplitBranchedState { SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ - SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ SPLIT_DATA_SUBSURFACE_ENTRIES \ -- cgit v1.2.3