diff options
Diffstat (limited to 'intern/cycles/kernel/device/gpu/parallel_active_index.h')
-rw-r--r-- | intern/cycles/kernel/device/gpu/parallel_active_index.h | 114 |
1 file changed, 83 insertions, 31 deletions
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index d7416beb783..f667ede2712 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifdef __KERNEL_METAL__ +struct ActiveIndexContext { + ActiveIndexContext(int _thread_index, + int _global_index, + int _threadgroup_size, + int _simdgroup_size, + int _simd_lane_index, + int _simd_group_index, + int _num_simd_groups, + threadgroup int *_simdgroup_offset) + : thread_index(_thread_index), + global_index(_global_index), + blocksize(_threadgroup_size), + ccl_gpu_warp_size(_simdgroup_size), + thread_warp(_simd_lane_index), + warp_index(_simd_group_index), + num_warps(_num_simd_groups), + warp_offset(_simdgroup_offset) + { + } + + const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, + num_warps; + threadgroup int *warp_offset; + + template<uint blocksizeDummy, typename IsActiveOp> + void active_index_array(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, + IsActiveOp is_active_op) + { + const uint state_index = global_index; +#else template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, - int *indices, - int *num_indices, + ccl_global int *indices, + ccl_global int *num_indices, IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint warp_index = thread_index / ccl_gpu_warp_size; const uint num_warps = blocksize / ccl_gpu_warp_size; - /* Test if state corresponding to this thread is active. */ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; - const uint is_active = (state_index < num_states) ? 
is_active_op(state_index) : 0; +#endif - /* For each thread within a warp compute how many other active states precede it. */ - const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; - /* Last thread in warp stores number of active states for each warp. */ - if (thread_warp == ccl_gpu_warp_size - 1) { - warp_offset[warp_index] = thread_offset + is_active; - } + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); - ccl_gpu_syncthreads(); - - /* Last thread in block converts per-warp sizes to offsets, increments global size of - * index array and gets offset to write to. */ - if (thread_index == blocksize - 1) { - /* TODO: parallelize this. */ - int offset = 0; - for (int i = 0; i < num_warps; i++) { - int num_active = warp_offset[i]; - warp_offset[i] = offset; - offset += num_active; + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; } - const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; - warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); - } + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. 
*/ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } - ccl_gpu_syncthreads(); + ccl_gpu_syncthreads(); - /* Write to index array. */ - if (is_active) { - const uint block_offset = warp_offset[num_warps]; - indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } } -} + +#ifdef __KERNEL_METAL__ +}; /* end class ActiveIndexContext */ + +/* inject the required thread params into a struct, and redirect to its templated member function + */ +# define gpu_parallel_active_index_array \ + ActiveIndexContext(metal_local_id, \ + metal_global_id, \ + metal_local_size, \ + simdgroup_size, \ + simd_lane_index, \ + simd_group_index, \ + num_simd_groups, \ + simdgroup_offset) \ + .active_index_array +#endif CCL_NAMESPACE_END |