diff options
Diffstat (limited to 'intern/cycles/kernel/kernel_work_stealing.h')
-rw-r--r-- | intern/cycles/kernel/kernel_work_stealing.h | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h new file mode 100644 index 00000000000..9b83d972e97 --- /dev/null +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -0,0 +1,193 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_WORK_STEALING_H__ +#define __KERNEL_WORK_STEALING_H__ + +/* + * Utility functions for work stealing + */ + +#ifdef __WORK_STEALING__ + +#ifdef __KERNEL_OPENCL__ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#endif + +uint get_group_id_with_ray_index(uint ray_index, + uint tile_dim_x, + uint tile_dim_y, + uint parallel_samples, + int dim) +{ + if(dim == 0) { + uint x_span = ray_index % (tile_dim_x * parallel_samples); + return x_span / get_local_size(0); + } + else /*if(dim == 1)*/ { + kernel_assert(dim == 1); + uint y_span = ray_index / (tile_dim_x * parallel_samples); + return y_span / get_local_size(1); + } +} + +uint get_total_work(uint tile_dim_x, + uint tile_dim_y, + uint grp_idx, + uint grp_idy, + uint num_samples) +{ + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? get_local_size(1) + : threads_within_tile_border_y; + + return threads_within_tile_border_x * + threads_within_tile_border_y * + num_samples; +} + +/* Returns 0 in case there is no next work available */ +/* Returns 1 in case work assigned is valid */ +int get_next_work(ccl_global uint *work_pool, + ccl_private uint *my_work, + uint tile_dim_x, + uint tile_dim_y, + uint num_samples, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint total_work = get_total_work(tile_dim_x, + tile_dim_y, + grp_idx, + grp_idy, + num_samples); + uint group_index = grp_idy * get_num_groups(0) + grp_idx; + *my_work = atomic_inc(&work_pool[group_index]); + return (*my_work < total_work) ? 1 : 0; +} + +/* This function assumes that the passed my_work is valid. */ +/* Decode sample number w.r.t. assigned my_work. */ +uint get_my_sample(uint my_work, + uint tile_dim_x, + uint tile_dim_y, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? get_local_size(1) + : threads_within_tile_border_y; + + return my_work / + (threads_within_tile_border_x * threads_within_tile_border_y); +} + +/* Decode pixel and tile position w.r.t. assigned my_work. */ +void get_pixel_tile_position(ccl_private uint *pixel_x, + ccl_private uint *pixel_y, + ccl_private uint *tile_x, + ccl_private uint *tile_y, + uint my_work, + uint tile_dim_x, + uint tile_dim_y, + uint tile_offset_x, + uint tile_offset_y, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? get_local_size(1) + : threads_within_tile_border_y; + + uint total_associated_pixels = + threads_within_tile_border_x * threads_within_tile_border_y; + uint work_group_pixel_index = my_work % total_associated_pixels; + uint work_group_pixel_x = + work_group_pixel_index % threads_within_tile_border_x; + uint work_group_pixel_y = + work_group_pixel_index / threads_within_tile_border_x; + + *pixel_x = + tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; + *pixel_y = + tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; + *tile_x = *pixel_x - tile_offset_x; + *tile_y = *pixel_y - tile_offset_y; +} + +#endif /* __WORK_STEALING__ */ + +#endif /* __KERNEL_WORK_STEALING_H__ */ |