/*
 * Copyright 2011-2015 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __KERNEL_WORK_STEALING_H__
#define __KERNEL_WORK_STEALING_H__

/*
 * Utility functions for work stealing
 */

#ifdef __WORK_STEALING__

#ifdef __KERNEL_OPENCL__
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif

/* Map a ray index to the index of its work-group along one dimension.
 *
 * ray_index is treated as a row-major index into a grid that is
 * (tile_dim_x * parallel_samples) entries wide. The column (dim == 0) or
 * row (dim == 1) of that entry is divided by the work-group size of the
 * same dimension to obtain the group index.
 *
 * tile_dim_y is not used by the computation; it is kept for the callers.
 * dim must be 0 or 1 (checked with kernel_assert for the non-zero case).
 */
uint get_group_id_with_ray_index(uint ray_index,
                                 uint tile_dim_x,
                                 uint tile_dim_y,
                                 uint parallel_samples,
                                 int dim)
{
	if(dim == 0) {
		uint x_span = ray_index % (tile_dim_x * parallel_samples);
		return x_span / get_local_size(0);
	}
	else /*if(dim == 1)*/ {
		kernel_assert(dim == 1);
		uint y_span = ray_index / (tile_dim_x * parallel_samples);
		return y_span / get_local_size(1);
	}
}

/* Number of threads of a work-group that lie within the tile border along
 * one dimension.
 *
 * tile_dim: tile size along dimension dim.
 * grp_id:   work-group index along dimension dim.
 * dim:      dimension passed on to get_local_size() / get_num_groups().
 *
 * Every group except the last one in the dimension contributes a full
 * get_local_size(dim) threads. The last group contributes the remainder
 * tile_dim % get_local_size(dim), or a full group when that remainder is
 * zero.
 *
 * Shared by get_total_work(), get_my_sample() and
 * get_pixel_tile_position() so that all of them decode work items against
 * the same per-group extent.
 */
uint get_threads_within_tile_border(uint tile_dim, uint grp_id, uint dim)
{
	uint local_size = get_local_size(dim);

	if(grp_id != (get_num_groups(dim) - 1)) {
		return local_size;
	}

	uint remainder = tile_dim % local_size;
	return (remainder == 0) ? local_size : remainder;
}

/* Total number of work items owned by the work-group (grp_idx, grp_idy):
 * the number of its threads inside the tile border in x, times the same
 * in y, times num_samples.
 */
uint get_total_work(uint tile_dim_x,
                    uint tile_dim_y,
                    uint grp_idx,
                    uint grp_idy,
                    uint num_samples)
{
	uint threads_within_tile_border_x =
	        get_threads_within_tile_border(tile_dim_x, grp_idx, 0);
	uint threads_within_tile_border_y =
	        get_threads_within_tile_border(tile_dim_y, grp_idy, 1);

	return threads_within_tile_border_x *
	       threads_within_tile_border_y *
	       num_samples;
}

/* Fetch the next work item for the work-group that ray_index belongs to.
 *
 * work_pool holds one counter per work-group, indexed by
 * grp_idy * get_num_groups(0) + grp_idx. The counter is bumped with
 * atomic_inc() and the value that call returns is stored in *my_work
 * (for OpenCL's atomic_inc this is the value before the increment).
 *
 * Returns 0 in case there is no next work available.
 * Returns 1 in case work assigned is valid.
 *
 * Note that the counter is incremented even when 0 is returned.
 */
int get_next_work(ccl_global uint *work_pool,
                  ccl_private uint *my_work,
                  uint tile_dim_x,
                  uint tile_dim_y,
                  uint num_samples,
                  uint parallel_samples,
                  uint ray_index)
{
	uint grp_idx = get_group_id_with_ray_index(ray_index,
	                                           tile_dim_x,
	                                           tile_dim_y,
	                                           parallel_samples,
	                                           0);
	uint grp_idy = get_group_id_with_ray_index(ray_index,
	                                           tile_dim_x,
	                                           tile_dim_y,
	                                           parallel_samples,
	                                           1);
	uint total_work = get_total_work(tile_dim_x,
	                                 tile_dim_y,
	                                 grp_idx,
	                                 grp_idy,
	                                 num_samples);
	uint group_index = grp_idy * get_num_groups(0) + grp_idx;

	*my_work = atomic_inc(&work_pool[group_index]);

	return (*my_work < total_work) ? 1 : 0;
}

/* This function assumes that the passed my_work is valid. */
/* Decode sample number w.r.t. assigned my_work.
 *
 * The sample number is my_work divided by the number of threads of the
 * owning work-group that lie within the tile border.
 */
uint get_my_sample(uint my_work,
                   uint tile_dim_x,
                   uint tile_dim_y,
                   uint parallel_samples,
                   uint ray_index)
{
	uint grp_idx = get_group_id_with_ray_index(ray_index,
	                                           tile_dim_x,
	                                           tile_dim_y,
	                                           parallel_samples,
	                                           0);
	uint grp_idy = get_group_id_with_ray_index(ray_index,
	                                           tile_dim_x,
	                                           tile_dim_y,
	                                           parallel_samples,
	                                           1);
	uint threads_within_tile_border_x =
	        get_threads_within_tile_border(tile_dim_x, grp_idx, 0);
	uint threads_within_tile_border_y =
	        get_threads_within_tile_border(tile_dim_y, grp_idy, 1);

	return my_work /
	       (threads_within_tile_border_x * threads_within_tile_border_y);
}

/* Decode pixel and tile position w.r.t. assigned my_work.
 *
 * my_work modulo the number of in-border threads of the owning work-group
 * gives a pixel index local to that group, which is split row-major into
 * an x/y offset using the in-border width of the group.
 *
 * pixel_x / pixel_y receive the position including tile_offset_x /
 * tile_offset_y; tile_x / tile_y receive the same position with the tile
 * offset subtracted again.
 */
void get_pixel_tile_position(ccl_private uint *pixel_x,
                             ccl_private uint *pixel_y,
                             ccl_private uint *tile_x,
                             ccl_private uint *tile_y,
                             uint my_work,
                             uint tile_dim_x,
                             uint tile_dim_y,
                             uint tile_offset_x,
                             uint tile_offset_y,
                             uint parallel_samples,
                             uint ray_index)
{
	uint grp_idx = get_group_id_with_ray_index(ray_index,
	                                           tile_dim_x,
	                                           tile_dim_y,
	                                           parallel_samples,
	                                           0);
	uint grp_idy = get_group_id_with_ray_index(ray_index,
	                                           tile_dim_x,
	                                           tile_dim_y,
	                                           parallel_samples,
	                                           1);
	uint threads_within_tile_border_x =
	        get_threads_within_tile_border(tile_dim_x, grp_idx, 0);
	uint threads_within_tile_border_y =
	        get_threads_within_tile_border(tile_dim_y, grp_idy, 1);

	uint total_associated_pixels =
	        threads_within_tile_border_x * threads_within_tile_border_y;
	uint work_group_pixel_index = my_work % total_associated_pixels;
	uint work_group_pixel_x =
	        work_group_pixel_index % threads_within_tile_border_x;
	uint work_group_pixel_y =
	        work_group_pixel_index / threads_within_tile_border_x;

	*pixel_x = tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
	*pixel_y = tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
	*tile_x = *pixel_x - tile_offset_x;
	*tile_y = *pixel_y - tile_offset_y;
}

#endif  /* __WORK_STEALING__ */

#endif  /* __KERNEL_WORK_STEALING_H__ */