author     Michael Jones <michael_p_jones@apple.com>    2022-02-10 21:03:52 +0300
committer  Michael Jones <michael_p_jones@apple.com>    2022-02-12 01:52:48 +0300
commit     27d3140b1363b852f449c81f941974fbd644464a (patch)
tree       9028df35c15a261acf271e094204c3298fe0db03 /intern
parent     40fce61a6abe79508022d3e0cd3a29e187f18e74 (diff)
Cycles: Fix Metal kernel compilation for AMD GPUs
Workaround for a compilation issue preventing kernels from compiling for AMD GPUs:
avoid problematic use of templates on Metal by making `gpu_parallel_active_index_array`
a wrapper macro, and by moving `blocksize` to be a macro parameter.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14081
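The idea behind the fix, in isolation: callers no longer spell out a template argument
(`gpu_parallel_active_index_array<BLOCKSIZE>(...)`); they pass the block size to a macro,
and the macro decides per backend whether it becomes a template argument or an ordinary
function parameter, so on Metal no template needs to be instantiated at the call site.
Below is a minimal, self-contained C++ sketch of that wrapper-macro pattern; the names
(`active_index_array`, `active_index_impl`, `SKETCH_METAL`) are hypothetical and the
bodies are serial stand-ins, not Cycles code.

#include <cstdio>

#ifndef SKETCH_METAL

/* "CUDA/HIP" flavour: the implementation stays templated on blocksize. */
template<unsigned int blocksize>
void active_index_impl(unsigned int num_states, int *indices, int *num_indices)
{
  /* Serial stand-in for the kernel body: collect indices of "active"
   * (here: even-numbered) states. */
  static_assert(blocksize > 0, "blocksize must be positive");
  for (unsigned int i = 0; i < num_states; i++) {
    if (i % 2 == 0) {
      indices[(*num_indices)++] = (int)i;
    }
  }
}

#  define active_index_array(blocksize, num_states, indices, num_indices) \
    active_index_impl<blocksize>(num_states, indices, num_indices)

#else

/* "Metal" flavour: no template at all; blocksize is a plain parameter. */
inline void active_index_impl(unsigned int num_states,
                              int *indices,
                              int *num_indices,
                              unsigned int blocksize)
{
  (void)blocksize; /* unused in this stand-in; a real kernel would use it */
  for (unsigned int i = 0; i < num_states; i++) {
    if (i % 2 == 0) {
      indices[(*num_indices)++] = (int)i;
    }
  }
}

#  define active_index_array(blocksize, num_states, indices, num_indices) \
    active_index_impl(num_states, indices, num_indices, blocksize)

#endif

int main()
{
  int indices[16];
  int num_indices = 0;
  /* The call site is identical under either backend: blocksize is passed
   * as a macro argument, never as explicit template syntax. */
  active_index_array(512, 16u, indices, &num_indices);
  std::printf("%d active states\n", num_indices);
  return 0;
}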
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 |  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  | 138
2 files changed, 66 insertions(+), 86 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index eed005803e2..7ebf8777b91 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -295,7 +295,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -310,7 +310,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -335,7 +335,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -348,7 +348,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -391,7 +391,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -424,7 +424,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index a5320edcb3c..12b93cd77a9 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,44 +31,26 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifndef __KERNEL_METAL__
+template<uint blocksize, typename IsActiveOp>
+__device__
+#endif
+void gpu_parallel_active_index_array_impl(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
#ifdef __KERNEL_METAL__
-struct ActiveIndexContext {
- ActiveIndexContext(int _thread_index,
- int _global_index,
- int _threadgroup_size,
- int _simdgroup_size,
- int _simd_lane_index,
- int _simd_group_index,
- int _num_simd_groups,
- threadgroup int *_simdgroup_offset)
- : thread_index(_thread_index),
- global_index(_global_index),
- blocksize(_threadgroup_size),
- ccl_gpu_warp_size(_simdgroup_size),
- thread_warp(_simd_lane_index),
- warp_index(_simd_group_index),
- num_warps(_num_simd_groups),
- warp_offset(_simdgroup_offset)
- {
- }
-
- const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
- num_warps;
- threadgroup int *warp_offset;
-
- template<uint blocksizeDummy, typename IsActiveOp>
- void active_index_array(const uint num_states,
- ccl_global int *indices,
- ccl_global int *num_indices,
- IsActiveOp is_active_op)
- {
- const uint state_index = global_index;
+ const uint is_active,
+ const uint blocksize,
+ const int thread_index,
+ const uint state_index,
+ const int ccl_gpu_warp_size,
+ const int thread_warp,
+ const int warp_index,
+ const int num_warps,
+ threadgroup int *warp_offset)
+{
#else
-template<uint blocksize, typename IsActiveOp>
-__device__ void gpu_parallel_active_index_array(const uint num_states,
- ccl_global int *indices,
- ccl_global int *num_indices,
- IsActiveOp is_active_op)
+ IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -79,61 +61,59 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint num_warps = blocksize / ccl_gpu_warp_size;
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#endif
- /* Test if state corresponding to this thread is active. */
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
- ccl_gpu_thread_mask(thread_warp));
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
}
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
- }
-
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
- }
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
}
+}
#ifdef __KERNEL_METAL__
-}; /* end class ActiveIndexContext */
-/* inject the required thread params into a struct, and redirect to its templated member function
- */
-# define gpu_parallel_active_index_array \
- ActiveIndexContext(metal_local_id, \
- metal_global_id, \
- metal_local_size, \
- simdgroup_size, \
- simd_lane_index, \
- simd_group_index, \
- num_simd_groups, \
- simdgroup_offset) \
- .active_index_array
+# define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+ const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \
+ metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \
+ simd_group_index, num_simd_groups, simdgroup_offset)
+
+#else
+
+# define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \
+ gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+
#endif
CCL_NAMESPACE_END
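
For readers who want the compaction scheme in parallel_active_index.h without the
ccl_gpu_* abstraction layer, here is a standalone CUDA-style sketch of the same
ballot/popcount pattern. It is not Cycles code: the kernel and functor names are
hypothetical, and it assumes a 32-wide warp and a block size that is a multiple of 32.

#include <cuda_runtime.h>

/* Hypothetical predicate: a state is "active" when its flag is non-zero. */
struct IsNonZero {
  const int *flags;
  __device__ int operator()(int i) const { return flags[i] != 0; }
};

template<int BLOCKSIZE, typename IsActiveOp>
__global__ void active_index_array_kernel(int num_states,
                                          int *indices,
                                          int *num_indices,
                                          IsActiveOp is_active_op)
{
  __shared__ int warp_offset[BLOCKSIZE / 32 + 1];

  const int thread_index = threadIdx.x;
  const int warp_index = thread_index / 32;
  const int lane = thread_index % 32;
  const int num_warps = BLOCKSIZE / 32;
  const int state_index = blockIdx.x * BLOCKSIZE + thread_index;

  /* Test whether the state handled by this thread is active. */
  const int is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

  /* Count how many active lanes precede this one within its warp. */
  const unsigned int ballot = __ballot_sync(0xffffffffu, is_active);
  const int thread_offset = __popc(ballot & ((1u << lane) - 1u));

  /* Last lane of each warp publishes that warp's active count. */
  if (lane == 31) {
    warp_offset[warp_index] = thread_offset + is_active;
  }
  __syncthreads();

  /* One thread turns per-warp counts into offsets and atomically reserves
   * space for the whole block in the global index array. */
  if (thread_index == BLOCKSIZE - 1) {
    int offset = 0;
    for (int i = 0; i < num_warps; i++) {
      const int num_active = warp_offset[i];
      warp_offset[i] = offset;
      offset += num_active;
    }
    warp_offset[num_warps] = atomicAdd(num_indices, offset);
  }
  __syncthreads();

  /* Every active thread writes its state index at its reserved slot. */
  if (is_active) {
    indices[warp_offset[num_warps] + warp_offset[warp_index] + thread_offset] = state_index;
  }
}

/* Example launch (host side), assuming device buffers d_indices, d_num_indices
 * and d_flags already exist:
 *   active_index_array_kernel<256><<<(num_states + 255) / 256, 256>>>(
 *       num_states, d_indices, d_num_indices, IsNonZero{d_flags});
 */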