diff options
Diffstat (limited to 'intern/cycles/kernel/device/gpu/parallel_sorted_index.h')
-rw-r--r-- | intern/cycles/kernel/device/gpu/parallel_sorted_index.h | 26 |
1 files changed, 18 insertions, 8 deletions
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 9bca1fad22f..c092e2a21ee 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN * * TODO: there may be ways to optimize this to avoid this many atomic ops? */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 @@ -33,20 +33,30 @@ CCL_NAMESPACE_BEGIN #endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -template<uint blocksize, typename GetKeyOp> -__device__ void gpu_parallel_sorted_index_array(const uint num_states, - int *indices, - int *num_indices, - int *key_prefix_sum, +template<typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint state_index, + const uint num_states, + const int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, GetKeyOp get_key_op) { - const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; const int key = (state_index < num_states) ? get_key_op(state_index) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); - indices[index] = state_index; + if (index < num_states_limit) { + /* Assign state index. */ + indices[index] = state_index; + } + else { + /* Can't process this state now, increase the counter again so that + * it will be handled in another iteration. */ + atomic_fetch_and_add_uint32(&key_counter[key], 1); + } } } |