git.blender.org/blender.git

Diffstat (limited to 'intern/cycles/kernel/device/gpu/parallel_active_index.h')
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  100
1 file changed, 93 insertions(+), 7 deletions(-)
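
This change adds a oneAPI/SYCL code path to the kernel that compacts the indices of active states into a contiguous array: each thread tests one state, a warp (sub-group) scan counts the active lanes that precede it, per-warp totals are converted into block-local offsets, and an atomic add reserves the block's range in the global index array. As a minimal illustration only (not part of the patch; warp_size, popcount32 and the activity pattern are made up), the host-side C++ sketch below emulates a single 32-wide warp and checks that the CUDA-style popcount(ballot & thread_mask) offset used in the existing path equals the exclusive scan used in the new SYCL path.

/* Host-side emulation of one warp, for illustration only: the bit operations
 * stand in for the ccl_gpu_* device builtins used in the real kernel. */

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

static int popcount32(uint32_t x)
{
  int n = 0;
  for (; x; x &= x - 1) {
    n++;
  }
  return n;
}

int main()
{
  const int warp_size = 32;

  /* Per-lane activity flags for one simulated warp (arbitrary test pattern). */
  std::vector<int> is_active(warp_size);
  for (int lane = 0; lane < warp_size; lane++) {
    is_active[lane] = (lane % 3) == 0;
  }

  /* CUDA/HIP-style path: ballot collects one bit per active lane. */
  uint32_t ballot = 0;
  for (int lane = 0; lane < warp_size; lane++) {
    if (is_active[lane]) {
      ballot |= uint32_t(1) << lane;
    }
  }

  /* SYCL-style path: exclusive scan of is_active over the sub-group. */
  int running_sum = 0;

  for (int lane = 0; lane < warp_size; lane++) {
    /* Mask of all lanes below this one, like ccl_gpu_thread_mask(thread_warp). */
    const uint32_t thread_mask = (uint32_t(1) << lane) - 1;
    const int offset_ballot = popcount32(ballot & thread_mask);

    const int offset_scan = running_sum; /* Exclusive: active lanes before this one. */
    running_sum += is_active[lane];

    /* Both formulations count the active lanes that precede this lane. */
    assert(offset_ballot == offset_scan);
    if (is_active[lane]) {
      printf("lane %2d writes to warp-local slot %d\n", lane, offset_ballot);
    }
  }
  return 0;
}
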
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 7d7266d5edf..c1df49c4f49 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
-#ifndef __KERNEL_METAL__
+/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
+ * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
+ * and keep device specific code in compat.h */
+
+#ifdef __KERNEL_ONEAPI__
+# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+template<typename IsActiveOp>
+void cpu_serial_active_index_array_impl(const uint num_states,
+ ccl_global int *ccl_restrict indices,
+ ccl_global int *ccl_restrict num_indices,
+ IsActiveOp is_active_op)
+{
+ int write_index = 0;
+ for (int state_index = 0; state_index < num_states; state_index++) {
+ if (is_active_op(state_index))
+ indices[write_index++] = state_index;
+ }
+ *num_indices = write_index;
+ return;
+}
+# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
+
+template<typename IsActiveOp>
+void gpu_parallel_active_index_array_impl(const uint num_states,
+ ccl_global int *ccl_restrict indices,
+ ccl_global int *ccl_restrict num_indices,
+ IsActiveOp is_active_op)
+{
+ const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
+ const uint blocksize = item_id.get_local_range(0);
+
+ sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
+ sycl::access::address_space::local_space>
+ ptr = sycl::ext::oneapi::group_local_memory<
+ int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
+ int *warp_offset = *ptr;
+
+ /* NOTE(@nsirgien): This computes the same values as the non-oneAPI code below, but
+ * faster for DPC++: CUDA already seems to optimize the "%", "/" and "*" based
+ * calculations below, while DPC++ does not, so it is better to query the needed
+ * parameters directly. Switching from these queries to the computation below causes
+ * a 2.5x performance slowdown. */
+ const uint thread_index = item_id.get_local_id(0);
+ const uint thread_warp = item_id.get_sub_group().get_local_id();
+
+ const uint warp_index = item_id.get_sub_group().get_group_id();
+ const uint num_warps = item_id.get_sub_group().get_group_range()[0];
+
+ const uint state_index = item_id.get_global_id(0);
+
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#else /* !__KERNEL_ONEAPI__ */
+# ifndef __KERNEL_METAL__
template<uint blocksize, typename IsActiveOp>
__device__
-#endif
+# endif
void
gpu_parallel_active_index_array_impl(const uint num_states,
ccl_global int *indices,
ccl_global int *num_indices,
-#ifdef __KERNEL_METAL__
+# ifdef __KERNEL_METAL__
const uint is_active,
const uint blocksize,
const int thread_index,
@@ -37,7 +90,7 @@ __device__
const int num_warps,
threadgroup int *warp_offset)
{
-#else
+# else
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -52,18 +105,33 @@ __device__
/* Test if state corresponding to this thread is active. */
const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
-#endif
-
+# endif
+#endif /* !__KERNEL_ONEAPI__ */
/* For each thread within a warp compute how many other active states precede it. */
+#ifdef __KERNEL_ONEAPI__
+ const uint thread_offset = sycl::exclusive_scan_over_group(
+ item_id.get_sub_group(), is_active, std::plus<>());
+#else
const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
ccl_gpu_thread_mask(thread_warp));
+#endif
/* Last thread in warp stores number of active states for each warp. */
+#ifdef __KERNEL_ONEAPI__
+ if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
+#else
if (thread_warp == ccl_gpu_warp_size - 1) {
+#endif
warp_offset[warp_index] = thread_offset + is_active;
}
+#ifdef __KERNEL_ONEAPI__
+ /* NOTE(@nsirgien): Only the local memory write (warp_offset) matters here,
+ * so the faster local barrier can be used. */
+ ccl_gpu_local_syncthreads();
+#else
ccl_gpu_syncthreads();
+#endif
/* Last thread in block converts per-warp sizes to offsets, increments global size of
* index array and gets offset to write to. */
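
For reference, a minimal host-side sketch of the step described by the comment above, under assumed values (four warps with made-up counts; std::atomic stands in for num_indices and atomic_fetch_and_add_uint32): a single thread turns the per-warp counts stored in warp_offset into exclusive offsets, then reserves a contiguous range in the global index array with an atomic add and publishes the returned base offset in warp_offset[num_warps] for the whole block.

/* Illustrative host-side version of the block-level pass, not taken from the patch. */

#include <atomic>
#include <cstdio>
#include <vector>

int main()
{
  const int num_warps = 4;

  /* Per-warp active-state counts as written by the last thread of each warp,
   * plus one extra slot that will receive the block's global base offset. */
  std::vector<int> warp_offset = {3, 0, 5, 2, 0};

  /* Global counter of indices written so far by other blocks. */
  std::atomic<int> num_indices{10};

  /* Convert per-warp sizes into exclusive offsets and sum the block total. */
  int block_num_active = 0;
  for (int i = 0; i < num_warps; i++) {
    const int num_active = warp_offset[i];
    warp_offset[i] = block_num_active;
    block_num_active += num_active;
  }

  /* Reserve a contiguous range in the global index array for this block. */
  warp_offset[num_warps] = num_indices.fetch_add(block_num_active);

  /* Every active thread then writes to
   * indices[warp_offset[num_warps] + warp_offset[warp_index] + thread_offset]. */
  for (int i = 0; i < num_warps; i++) {
    printf("warp %d starts at block-local offset %d\n", i, warp_offset[i]);
  }
  printf("block base offset in the global index array: %d\n", warp_offset[num_warps]);
  return 0;
}
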
@@ -80,7 +148,13 @@ __device__
warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
}
+#ifdef __KERNEL_ONEAPI__
+ /* NOTE(@nsirgien): Only the local memory write (warp_offset) matters here,
+ * so the faster local barrier can be used. */
+ ccl_gpu_local_syncthreads();
+#else
ccl_gpu_syncthreads();
+#endif
/* Write to index array. */
if (is_active) {
@@ -107,7 +181,19 @@ __device__
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
-
+#elif defined(__KERNEL_ONEAPI__)
+# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+# define gpu_parallel_active_index_array( \
+ blocksize, num_states, indices, num_indices, is_active_op) \
+ if (ccl_gpu_global_size_x() == 1) \
+ cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
+ else \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
+# else
+# define gpu_parallel_active_index_array( \
+ blocksize, num_states, indices, num_indices, is_active_op) \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+# endif
#else
# define gpu_parallel_active_index_array( \