author     Michael Jones <michael_p_jones@apple.com>    2022-02-10 21:03:52 +0300
committer  Michael Jones <michael_p_jones@apple.com>    2022-02-12 01:52:48 +0300
commit     27d3140b1363b852f449c81f941974fbd644464a (patch)
tree       9028df35c15a261acf271e094204c3298fe0db03 /intern
parent     40fce61a6abe79508022d3e0cd3a29e187f18e74 (diff)
Cycles: Fix Metal kernel compilation for AMD GPUs
Workaround for a compilation issue preventing kernels from compiling for AMD GPUs:
avoid problematic use of templates on Metal by making `gpu_parallel_active_index_array`
a wrapper macro, and by moving `blocksize` to be a macro parameter.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14081
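The idea behind the fix, in isolation: callers no longer spell out a template argument
(`gpu_parallel_active_index_array<BLOCKSIZE>(...)`); they pass the block size to a macro,
and the macro decides per backend whether it becomes a template argument or an ordinary
function parameter, so on Metal no template needs to be instantiated at the call site.
Below is a minimal, self-contained C++ sketch of that wrapper-macro pattern; the names
(`active_index_array`, `active_index_impl`, `SKETCH_METAL`) are hypothetical and the
bodies are serial stand-ins, not Cycles code.

#include <cstdio>

#ifndef SKETCH_METAL

/* "CUDA/HIP" flavour: the implementation stays templated on blocksize. */
template<unsigned int blocksize>
void active_index_impl(unsigned int num_states, int *indices, int *num_indices)
{
  /* Serial stand-in for the kernel body: collect indices of "active"
   * (here: even-numbered) states. */
  static_assert(blocksize > 0, "blocksize must be positive");
  for (unsigned int i = 0; i < num_states; i++) {
    if (i % 2 == 0) {
      indices[(*num_indices)++] = (int)i;
    }
  }
}

#  define active_index_array(blocksize, num_states, indices, num_indices) \
    active_index_impl<blocksize>(num_states, indices, num_indices)

#else

/* "Metal" flavour: no template at all; blocksize is a plain parameter. */
inline void active_index_impl(unsigned int num_states,
                              int *indices,
                              int *num_indices,
                              unsigned int blocksize)
{
  (void)blocksize; /* unused in this stand-in; a real kernel would use it */
  for (unsigned int i = 0; i < num_states; i++) {
    if (i % 2 == 0) {
      indices[(*num_indices)++] = (int)i;
    }
  }
}

#  define active_index_array(blocksize, num_states, indices, num_indices) \
    active_index_impl(num_states, indices, num_indices, blocksize)

#endif

int main()
{
  int indices[16];
  int num_indices = 0;
  /* The call site is identical under either backend: blocksize is passed
   * as a macro argument, never as explicit template syntax. */
  active_index_array(512, 16u, indices, &num_indices);
  std::printf("%d active states\n", num_indices);
  return 0;
}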
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 |  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  | 138
2 files changed, 66 insertions(+), 86 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index eed005803e2..7ebf8777b91 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -295,7 +295,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -310,7 +310,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -335,7 +335,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -348,7 +348,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -391,7 +391,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -424,7 +424,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index a5320edcb3c..12b93cd77a9 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,44 +31,26 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifndef __KERNEL_METAL__
+template<uint blocksize, typename IsActiveOp>
+__device__
+#endif
+void gpu_parallel_active_index_array_impl(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
#ifdef __KERNEL_METAL__
-struct ActiveIndexContext {
- ActiveIndexContext(int _thread_index,
- int _global_index,
- int _threadgroup_size,
- int _simdgroup_size,
- int _simd_lane_index,
- int _simd_group_index,
- int _num_simd_groups,
- threadgroup int *_simdgroup_offset)
- : thread_index(_thread_index),
- global_index(_global_index),
- blocksize(_threadgroup_size),
- ccl_gpu_warp_size(_simdgroup_size),
- thread_warp(_simd_lane_index),
- warp_index(_simd_group_index),
- num_warps(_num_simd_groups),
- warp_offset(_simdgroup_offset)
- {
- }
-
- const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
- num_warps;
- threadgroup int *warp_offset;
-
- template<uint blocksizeDummy, typename IsActiveOp>
- void active_index_array(const uint num_states,
- ccl_global int *indices,
- ccl_global int *num_indices,
- IsActiveOp is_active_op)
- {
- const uint state_index = global_index;
+ const uint is_active,
+ const uint blocksize,
+ const int thread_index,
+ const uint state_index,
+ const int ccl_gpu_warp_size,
+ const int thread_warp,
+ const int warp_index,
+ const int num_warps,
+ threadgroup int *warp_offset)
+{
#else
-template<uint blocksize, typename IsActiveOp>
-__device__ void gpu_parallel_active_index_array(const uint num_states,
- ccl_global int *indices,
- ccl_global int *num_indices,
- IsActiveOp is_active_op)
+ IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -79,61 +61,59 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint num_warps = blocksize / ccl_gpu_warp_size;
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#endif
- /* Test if state corresponding to this thread is active. */
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
- ccl_gpu_thread_mask(thread_warp));
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
}
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
- }
-
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
- }
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
}
+}
#ifdef __KERNEL_METAL__
-}; /* end class ActiveIndexContext */
-/* inject the required thread params into a struct, and redirect to its templated member function
- */
-# define gpu_parallel_active_index_array \
- ActiveIndexContext(metal_local_id, \
- metal_global_id, \
- metal_local_size, \
- simdgroup_size, \
- simd_lane_index, \
- simd_group_index, \
- num_simd_groups, \
- simdgroup_offset) \
- .active_index_array
+# define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+ const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \
+ metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \
+ simd_group_index, num_simd_groups, simdgroup_offset)
+
+#else
+
+# define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \
+ gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+
#endif
CCL_NAMESPACE_END
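
For readers who want the compaction scheme in parallel_active_index.h without the
ccl_gpu_* abstraction layer, here is a standalone CUDA-style sketch of the same
ballot/popcount pattern. It is not Cycles code: the kernel and functor names are
hypothetical, and it assumes a 32-wide warp and a block size that is a multiple of 32.

#include <cuda_runtime.h>

/* Hypothetical predicate: a state is "active" when its flag is non-zero. */
struct IsNonZero {
  const int *flags;
  __device__ int operator()(int i) const { return flags[i] != 0; }
};

template<int BLOCKSIZE, typename IsActiveOp>
__global__ void active_index_array_kernel(int num_states,
                                          int *indices,
                                          int *num_indices,
                                          IsActiveOp is_active_op)
{
  __shared__ int warp_offset[BLOCKSIZE / 32 + 1];

  const int thread_index = threadIdx.x;
  const int warp_index = thread_index / 32;
  const int lane = thread_index % 32;
  const int num_warps = BLOCKSIZE / 32;
  const int state_index = blockIdx.x * BLOCKSIZE + thread_index;

  /* Test whether the state handled by this thread is active. */
  const int is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

  /* Count how many active lanes precede this one within its warp. */
  const unsigned int ballot = __ballot_sync(0xffffffffu, is_active);
  const int thread_offset = __popc(ballot & ((1u << lane) - 1u));

  /* Last lane of each warp publishes that warp's active count. */
  if (lane == 31) {
    warp_offset[warp_index] = thread_offset + is_active;
  }
  __syncthreads();

  /* One thread turns per-warp counts into offsets and atomically reserves
   * space for the whole block in the global index array. */
  if (thread_index == BLOCKSIZE - 1) {
    int offset = 0;
    for (int i = 0; i < num_warps; i++) {
      const int num_active = warp_offset[i];
      warp_offset[i] = offset;
      offset += num_active;
    }
    warp_offset[num_warps] = atomicAdd(num_indices, offset);
  }
  __syncthreads();

  /* Every active thread writes its state index at its reserved slot. */
  if (is_active) {
    indices[warp_offset[num_warps] + warp_offset[warp_index] + thread_offset] = state_index;
  }
}

/* Example launch (host side), assuming device buffers d_indices, d_num_indices
 * and d_flags already exist:
 *   active_index_array_kernel<256><<<(num_states + 255) / 256, 256>>>(
 *       num_states, d_indices, d_num_indices, IsNonZero{d_flags});
 */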