Diffstat (limited to 'intern/cycles/kernel/device/gpu/parallel_active_index.h')
 -rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 100
 1 file changed, 93 insertions(+), 7 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 7d7266d5edf..c1df49c4f49 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
-#ifndef __KERNEL_METAL__
+/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
+ * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
+ * and keep device specific code in compat.h. */
+
+#ifdef __KERNEL_ONEAPI__
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+template<typename IsActiveOp>
+void cpu_serial_active_index_array_impl(const uint num_states,
+                                        ccl_global int *ccl_restrict indices,
+                                        ccl_global int *ccl_restrict num_indices,
+                                        IsActiveOp is_active_op)
+{
+  int write_index = 0;
+  for (int state_index = 0; state_index < num_states; state_index++) {
+    if (is_active_op(state_index))
+      indices[write_index++] = state_index;
+  }
+  *num_indices = write_index;
+  return;
+}
+#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
+
+template<typename IsActiveOp>
+void gpu_parallel_active_index_array_impl(const uint num_states,
+                                          ccl_global int *ccl_restrict indices,
+                                          ccl_global int *ccl_restrict num_indices,
+                                          IsActiveOp is_active_op)
+{
+  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
+  const uint blocksize = item_id.get_local_range(0);
+
+  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
+                  sycl::access::address_space::local_space>
+      ptr = sycl::ext::oneapi::group_local_memory<
+          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
+  int *warp_offset = *ptr;
+
+  /* NOTE(@nsirgien): Here we calculate the same values as below, but faster for DPC++:
+   * CUDA seems to already convert the "%", "/", "*" based calculations below into
+   * something faster, but DPC++ does not, so it is better to request the needed
+   * parameters directly. Switching from this computation to the one below causes a
+   * 2.5x performance slowdown. */
+  const uint thread_index = item_id.get_local_id(0);
+  const uint thread_warp = item_id.get_sub_group().get_local_id();
+
+  const uint warp_index = item_id.get_sub_group().get_group_id();
+  const uint num_warps = item_id.get_sub_group().get_group_range()[0];
+
+  const uint state_index = item_id.get_global_id(0);
+
+  /* Test if state corresponding to this thread is active. */
+  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#else /* !__KERNEL_ONEAPI__ */
+#  ifndef __KERNEL_METAL__
 template<uint blocksize, typename IsActiveOp>
 __device__
-#endif
+#  endif
 void gpu_parallel_active_index_array_impl(const uint num_states,
                                           ccl_global int *indices,
                                           ccl_global int *num_indices,
-#ifdef __KERNEL_METAL__
+#  ifdef __KERNEL_METAL__
                                           const uint is_active,
                                           const uint blocksize,
                                           const int thread_index,
@@ -37,7 +90,7 @@ __device__
                                           const int num_warps,
                                           threadgroup int *warp_offset)
 {
-#else
+#  else
                                           IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
@@ -52,18 +105,33 @@ __device__
   /* Test if state corresponding to this thread is active. */
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
-#endif
-
+#  endif
+#endif /* !__KERNEL_ONEAPI__ */
   /* For each thread within a warp compute how many other active states precede it. */
+#ifdef __KERNEL_ONEAPI__
+  const uint thread_offset = sycl::exclusive_scan_over_group(
+      item_id.get_sub_group(), is_active, std::plus<>());
+#else
   const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                       ccl_gpu_thread_mask(thread_warp));
+#endif
 
   /* Last thread in warp stores number of active states for each warp. */
+#ifdef __KERNEL_ONEAPI__
+  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
+#else
   if (thread_warp == ccl_gpu_warp_size - 1) {
+#endif
     warp_offset[warp_index] = thread_offset + is_active;
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): Only the write to local memory (warp_offset) matters here,
+   * so the faster local barrier can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Last thread in block converts per-warp sizes to offsets, increments global size of
    * index array and gets offset to write to. */
@@ -80,7 +148,13 @@ __device__
     warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): Only the write to local memory (warp_offset) matters here,
+   * so the faster local barrier can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Write to index array. */
   if (is_active) {
@@ -107,7 +181,19 @@ __device__
                                     simd_group_index, \
                                     num_simd_groups, \
                                     simdgroup_offset)
-
+#elif defined(__KERNEL_ONEAPI__)
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      if (ccl_gpu_global_size_x() == 1) \
+        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
+      else \
+        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
+#  else
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+#  endif
 #else
 #  define gpu_parallel_active_index_array( \
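
Taken together, the patched kernel performs a block-wide stream compaction: each warp ballots its active threads, per-warp counts are scanned into offsets in shared/local memory, one atomic add reserves a contiguous range in the global index array, and every active thread then writes its state index into its slot. The following host-side C++ sketch simulates that control flow serially so it can be read and run outside a GPU toolchain. It is illustrative only: WARP_SIZE, BLOCK_SIZE and simulate_block are names invented here, warp ballots are emulated with 32-bit masks, and the barriers of the real kernel become comments.

// Host-side simulation of the block-wide compaction done by
// gpu_parallel_active_index_array_impl. Illustrative sketch, not Cycles code.
#include <atomic>
#include <bit>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

constexpr uint32_t WARP_SIZE = 32;   // Assumed warp/sub-group size.
constexpr uint32_t BLOCK_SIZE = 512; // Mirrors the header's default block size.

static void simulate_block(uint32_t block_index,
                           uint32_t num_states,
                           const std::function<bool(uint32_t)> &is_active_op,
                           std::vector<int> &indices,
                           std::atomic<uint32_t> &num_indices)
{
  const uint32_t num_warps = BLOCK_SIZE / WARP_SIZE;
  std::vector<uint32_t> warp_offset(num_warps + 1, 0); // Stand-in for the shared array.
  std::vector<uint8_t> is_active(BLOCK_SIZE, 0);
  std::vector<uint32_t> thread_offset(BLOCK_SIZE, 0);

  // Per warp: build the ballot mask, then give each lane the popcount of the
  // active lanes below it (the exclusive prefix sum within the warp).
  for (uint32_t warp_index = 0; warp_index < num_warps; warp_index++) {
    uint32_t ballot = 0;
    for (uint32_t lane = 0; lane < WARP_SIZE; lane++) {
      const uint32_t t = warp_index * WARP_SIZE + lane;
      const uint32_t state_index = block_index * BLOCK_SIZE + t;
      is_active[t] = (state_index < num_states) && is_active_op(state_index);
      ballot |= uint32_t(is_active[t]) << lane;
    }
    for (uint32_t lane = 0; lane < WARP_SIZE; lane++) {
      const uint32_t t = warp_index * WARP_SIZE + lane;
      thread_offset[t] = std::popcount(ballot & ((1u << lane) - 1u));
    }
    // "Last thread in warp stores number of active states for each warp."
    const uint32_t last = warp_index * WARP_SIZE + WARP_SIZE - 1;
    warp_offset[warp_index] = thread_offset[last] + is_active[last];
  }

  // (block barrier) One thread turns per-warp counts into exclusive offsets
  // and reserves space in the global array with a single atomic add.
  uint32_t block_num_active = 0;
  for (uint32_t warp_index = 0; warp_index < num_warps; warp_index++) {
    const uint32_t count = warp_offset[warp_index];
    warp_offset[warp_index] = block_num_active;
    block_num_active += count;
  }
  warp_offset[num_warps] = num_indices.fetch_add(block_num_active);

  // (block barrier) Each active thread writes its state index to its slot.
  for (uint32_t t = 0; t < BLOCK_SIZE; t++) {
    if (is_active[t]) {
      const uint32_t warp_index = t / WARP_SIZE;
      indices[warp_offset[num_warps] + warp_offset[warp_index] + thread_offset[t]] =
          int(block_index * BLOCK_SIZE + t);
    }
  }
}

int main()
{
  const uint32_t num_states = 1000;
  std::vector<int> indices(num_states, -1);
  std::atomic<uint32_t> num_indices{0};
  const auto is_active_op = [](uint32_t i) { return i % 3 == 0; }; // Invented predicate.

  for (uint32_t b = 0; b * BLOCK_SIZE < num_states; b++)
    simulate_block(b, num_states, is_active_op, indices, num_indices);

  printf("%u active states compacted\n", num_indices.load());
  return 0;
}

Running it compacts the indices of every third state, which is the shape of result a launch over the same predicate would produce.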
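The first NOTE(@nsirgien) refers to the arithmetic form of the warp coordinates used on the non-oneAPI path. That form is not visible in the hunks above (it sits in an unchanged region of the file), but presumably derives the same three values from the flat thread index. The sketch below shows the assumed equivalence, with warp_size and blocksize fixed to plausible values (32, and the header's default of 512):

// Assumed equivalence between the arithmetic warp coordinates and the
// sub-group queries used on the oneAPI path. Values here are assumptions.
#include <cassert>
#include <cstdio>

constexpr unsigned warp_size = 32;
constexpr unsigned blocksize = 512;

int main()
{
  for (unsigned thread_index = 0; thread_index < blocksize; thread_index++) {
    // Arithmetic form, as presumably used on the non-oneAPI paths.
    const unsigned thread_warp = thread_index % warp_size; // Lane inside its warp.
    const unsigned warp_index = thread_index / warp_size;  // Warp inside the block.
    const unsigned num_warps = blocksize / warp_size;      // Warps per block.

    // The oneAPI path in the diff fetches the same values directly:
    //   thread_warp == sub_group.get_local_id()
    //   warp_index  == sub_group.get_group_id()
    //   num_warps   == sub_group.get_group_range()[0]
    assert(warp_index * warp_size + thread_warp == thread_index);
    assert(warp_index < num_warps);
  }
  printf("warp coordinate forms agree for all %u threads\n", blocksize);
  return 0;
}

Fetching these values from the sub-group API instead of recomputing them is what the note credits with avoiding the 2.5x slowdown under DPC++.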
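On the oneAPI path the per-thread offset comes from sycl::exclusive_scan_over_group over the sub-group, while the other backends use popcount(ccl_gpu_ballot(is_active) & ccl_gpu_thread_mask(thread_warp)). Assuming ccl_gpu_thread_mask(n) yields the mask of the n lanes below the calling lane (which the popcount expression requires), both compute the same exclusive prefix sum. A small serial demonstration over a toy 8-lane sub-group:

// Serial check that an exclusive scan over 0/1 flags equals a popcount of
// the ballot mask restricted to lower lanes. Toy 8-lane sub-group.
#include <bit>
#include <cstdint>
#include <cstdio>

int main()
{
  const uint8_t is_active[8] = {1, 0, 1, 1, 0, 0, 1, 0};

  uint32_t ballot = 0; // What a warp/sub-group ballot of is_active would return.
  for (int lane = 0; lane < 8; lane++)
    ballot |= uint32_t(is_active[lane]) << lane;

  uint32_t running = 0; // Exclusive-scan accumulator.
  for (int lane = 0; lane < 8; lane++) {
    const uint32_t scan = running; // exclusive_scan_over_group result for this lane.
    const uint32_t pop = std::popcount(ballot & ((1u << lane) - 1u)); // popcount form.
    printf("lane %d: scan=%u popcount=%u\n", lane, scan, pop);        // Always equal.
    running += is_active[lane];
  }
  return 0;
}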