Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/device/gpu/parallel_active_index.h')
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_active_index.h114
1 file changed, 83 insertions, 31 deletions
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index d7416beb783..f667ede2712 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+ ActiveIndexContext(int _thread_index,
+ int _global_index,
+ int _threadgroup_size,
+ int _simdgroup_size,
+ int _simd_lane_index,
+ int _simd_group_index,
+ int _num_simd_groups,
+ threadgroup int *_simdgroup_offset)
+ : thread_index(_thread_index),
+ global_index(_global_index),
+ blocksize(_threadgroup_size),
+ ccl_gpu_warp_size(_simdgroup_size),
+ thread_warp(_simd_lane_index),
+ warp_index(_simd_group_index),
+ num_warps(_num_simd_groups),
+ warp_offset(_simdgroup_offset)
+ {
+ }
+
+ const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+ num_warps;
+ threadgroup int *warp_offset;
+
+ template<uint blocksizeDummy, typename IsActiveOp>
+ void active_index_array(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ IsActiveOp is_active_op)
+ {
+ const uint state_index = global_index;
+#else
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
- int *indices,
- int *num_indices,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint warp_index = thread_index / ccl_gpu_warp_size;
const uint num_warps = blocksize / ccl_gpu_warp_size;
- /* Test if state corresponding to this thread is active. */
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
- const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
- }
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
}
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
}
-}
+
+#ifdef __KERNEL_METAL__
+}; /* end class ActiveIndexContext */
+
+/* inject the required thread params into a struct, and redirect to its templated member function
+ */
+# define gpu_parallel_active_index_array \
+ ActiveIndexContext(metal_local_id, \
+ metal_global_id, \
+ metal_local_size, \
+ simdgroup_size, \
+ simd_lane_index, \
+ simd_group_index, \
+ num_simd_groups, \
+ simdgroup_offset) \
+ .active_index_array
+#endif
CCL_NAMESPACE_END