From 27d3140b1363b852f449c81f941974fbd644464a Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Thu, 10 Feb 2022 18:03:52 +0000
Subject: Cycles: Fix Metal kernel compilation for AMD GPUs

Workaround for a compilation issue preventing kernels from compiling for AMD
GPUs: avoid problematic use of templates on Metal by making
`gpu_parallel_active_index_array` a wrapper macro, and moving `blocksize` to
be a macro parameter.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14081
---
 intern/cycles/kernel/device/gpu/kernel.h           |  14 +--
 .../kernel/device/gpu/parallel_active_index.h      | 138 +++++++++------
 2 files changed, 66 insertions(+), 86 deletions(-)

(limited to 'intern')

diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index eed005803e2..7ebf8777b91 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -295,7 +295,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -310,7 +310,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -335,7 +335,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -348,7 +348,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -391,7 +391,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
@@ -424,7 +424,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array(
+  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
       num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index a5320edcb3c..12b93cd77a9 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,44 +31,26 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
+#ifndef __KERNEL_METAL__
+template<uint blocksize, typename IsActiveOp>
+__device__
+#endif
+void gpu_parallel_active_index_array_impl(const uint num_states,
+                                          ccl_global int *indices,
+                                          ccl_global int *num_indices,
 #ifdef __KERNEL_METAL__
-struct ActiveIndexContext {
-  ActiveIndexContext(int _thread_index,
-                     int _global_index,
-                     int _threadgroup_size,
-                     int _simdgroup_size,
-                     int _simd_lane_index,
-                     int _simd_group_index,
-                     int _num_simd_groups,
-                     threadgroup int *_simdgroup_offset)
-      : thread_index(_thread_index),
-        global_index(_global_index),
-        blocksize(_threadgroup_size),
-        ccl_gpu_warp_size(_simdgroup_size),
-        thread_warp(_simd_lane_index),
-        warp_index(_simd_group_index),
-        num_warps(_num_simd_groups),
-        warp_offset(_simdgroup_offset)
-  {
-  }
-
-  const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
-      num_warps;
-  threadgroup int *warp_offset;
-
-  template<typename IsActiveOp>
-  void active_index_array(const uint num_states,
-                          ccl_global int *indices,
-                          ccl_global int *num_indices,
-                          IsActiveOp is_active_op)
-  {
-    const uint state_index = global_index;
+                                          const uint is_active,
+                                          const uint blocksize,
+                                          const int thread_index,
+                                          const uint state_index,
+                                          const int ccl_gpu_warp_size,
+                                          const int thread_warp,
+                                          const int warp_index,
+                                          const int num_warps,
+                                          threadgroup int *warp_offset)
+{
 #else
-template<uint blocksize, typename IsActiveOp>
-__device__ void gpu_parallel_active_index_array(const uint num_states,
-                                                ccl_global int *indices,
-                                                ccl_global int *num_indices,
-                                                IsActiveOp is_active_op)
+                                          IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
 
@@ -79,61 +61,59 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
   const uint num_warps = blocksize / ccl_gpu_warp_size;
 
   const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+
+  /* Test if state corresponding to this thread is active. */
+  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 #endif
 
-    /* Test if state corresponding to this thread is active. */
-    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+  /* For each thread within a warp compute how many other active states precede it. */
+  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                      ccl_gpu_thread_mask(thread_warp));
 
-    /* For each thread within a warp compute how many other active states precede it. */
-    const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
-                                        ccl_gpu_thread_mask(thread_warp));
+  /* Last thread in warp stores number of active states for each warp. */
+  if (thread_warp == ccl_gpu_warp_size - 1) {
+    warp_offset[warp_index] = thread_offset + is_active;
+  }
 
-    /* Last thread in warp stores number of active states for each warp. */
-    if (thread_warp == ccl_gpu_warp_size - 1) {
-      warp_offset[warp_index] = thread_offset + is_active;
+  ccl_gpu_syncthreads();
+
+  /* Last thread in block converts per-warp sizes to offsets, increments global size of
+   * index array and gets offset to write to. */
+  if (thread_index == blocksize - 1) {
+    /* TODO: parallelize this. */
+    int offset = 0;
+    for (int i = 0; i < num_warps; i++) {
+      int num_active = warp_offset[i];
+      warp_offset[i] = offset;
+      offset += num_active;
     }
 
-    ccl_gpu_syncthreads();
-
-    /* Last thread in block converts per-warp sizes to offsets, increments global size of
-     * index array and gets offset to write to. */
-    if (thread_index == blocksize - 1) {
-      /* TODO: parallelize this. */
-      int offset = 0;
-      for (int i = 0; i < num_warps; i++) {
-        int num_active = warp_offset[i];
-        warp_offset[i] = offset;
-        offset += num_active;
-      }
-
-      const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
-      warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
-    }
+    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+  }
 
-    ccl_gpu_syncthreads();
+  ccl_gpu_syncthreads();
 
-    /* Write to index array. */
-    if (is_active) {
-      const uint block_offset = warp_offset[num_warps];
-      indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
-    }
+  /* Write to index array. */
+  if (is_active) {
+    const uint block_offset = warp_offset[num_warps];
+    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
   }
+}
 
 #ifdef __KERNEL_METAL__
-}; /* end class ActiveIndexContext */
-
-/* inject the required thread params into a struct, and redirect to its templated member function
- */
-#  define gpu_parallel_active_index_array \
-    ActiveIndexContext(metal_local_id, \
-                       metal_global_id, \
-                       metal_local_size, \
-                       simdgroup_size, \
-                       simd_lane_index, \
-                       simd_group_index, \
-                       num_simd_groups, \
-                       simdgroup_offset) \
-        .active_index_array
+
+#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \
+        metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \
+        simd_group_index, num_simd_groups, simdgroup_offset)
+
+#else
+
+#  define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+
+#endif
 
 CCL_NAMESPACE_END
--
cgit v1.2.3
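
Note (not part of the patch): below is a minimal, hypothetical C++ sketch of the pattern this change applies, namely replacing a direct call to a templated function with a wrapper macro that forwards the block size as a macro argument, so a Metal build can route the same call site to a non-templated implementation. The names parallel_op and parallel_op_impl are illustrative only and do not exist in Cycles.

    #include <cstdio>

    /* Templated implementation, analogous to the non-Metal
     * gpu_parallel_active_index_array_impl<blocksize>(...) path. */
    template<int blocksize> void parallel_op_impl(int num_states)
    {
      std::printf("blocksize=%d num_states=%d\n", blocksize, num_states);
    }

    /* Wrapper macro: call sites pass the block size explicitly, mirroring
     * gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE, ...).
     * A Metal build could define this macro differently and avoid the template. */
    #define parallel_op(blocksize, num_states) parallel_op_impl<blocksize>(num_states)

    int main()
    {
      parallel_op(512, 1024); /* expands to parallel_op_impl<512>(1024) */
      return 0;
    }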