
git.blender.org/blender.git
author    Brecht Van Lommel <brecht@blender.org>  2022-02-12 01:58:41 +0300
committer Brecht Van Lommel <brecht@blender.org>  2022-02-12 01:58:41 +0300
commit    35c261dfcf61bbbf21df68ef82066ec3f0c2b76f
tree      5cd1abfdf7e29bbb12fec937c1650a1188815d59
parent    f3c58c65054fb3b0c0c09ede249c71fb4dd2c62e
parent    27d3140b1363b852f449c81f941974fbd644464a

Merge branch 'blender-v3.1-release'
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 |  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  | 140
2 files changed, 67 insertions(+), 87 deletions(-)
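
What the merge changes at the call sites: gpu_parallel_active_index_array no longer takes the block size as a C++ template argument; the block size becomes the first argument of a macro, which lets the Metal and CUDA/HIP backends share the same spelling. A minimal sketch of the call-site change, with the surrounding kernel boilerplate elided and names taken from the hunks below:

    /* Before: block size as a template argument; Metal needed a separate
     * class-based redirect (removed in the second file below). */
    gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
        num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);

    /* After: block size as the first macro argument; non-Metal backends forward
     * it as a template argument, the Metal backend ignores it. */
    gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
        num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
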
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index e44941a1313..5dacf2910be 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -282,7 +282,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -297,7 +297,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -309,7 +309,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -322,7 +322,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -335,7 +335,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
{
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -378,7 +378,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
@@ -411,7 +411,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
int num_active_paths);
ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
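
For reading the call sites above: the only property of ccl_gpu_kernel_lambda_pass that the compaction code relies on is that it can be invoked as is_active_op(state_index) and returns non-zero for active states. A rough mental model of such a predicate object, written as a hypothetical struct rather than the actual ccl_gpu_kernel_lambda expansion:

    /* Hypothetical stand-in for the object produced by ccl_gpu_kernel_lambda(...).
     * The real macro in Cycles is more involved; queued_kernel_of() is illustrative only. */
    struct QueuedKernelPass {
      int operator()(const int state_index) const
      {
        /* Stand-in for INTEGRATOR_STATE(state, path, queued_kernel) != 0. */
        return queued_kernel_of(state_index) != 0;
      }
    };
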
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 33b108f9625..32dbe0ddaa3 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,44 +18,26 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifndef __KERNEL_METAL__
+template<uint blocksize, typename IsActiveOp>
+__device__
+#endif
+void gpu_parallel_active_index_array_impl(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
#ifdef __KERNEL_METAL__
-struct ActiveIndexContext {
- ActiveIndexContext(int _thread_index,
- int _global_index,
- int _threadgroup_size,
- int _simdgroup_size,
- int _simd_lane_index,
- int _simd_group_index,
- int _num_simd_groups,
- threadgroup int *_simdgroup_offset)
- : thread_index(_thread_index),
- global_index(_global_index),
- blocksize(_threadgroup_size),
- ccl_gpu_warp_size(_simdgroup_size),
- thread_warp(_simd_lane_index),
- warp_index(_simd_group_index),
- num_warps(_num_simd_groups),
- warp_offset(_simdgroup_offset)
- {
- }
-
- const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
- num_warps;
- threadgroup int *warp_offset;
-
- template<uint blocksizeDummy, typename IsActiveOp>
- void active_index_array(const uint num_states,
- ccl_global int *indices,
- ccl_global int *num_indices,
- IsActiveOp is_active_op)
- {
- const uint state_index = global_index;
+ const uint is_active,
+ const uint blocksize,
+ const int thread_index,
+ const uint state_index,
+ const int ccl_gpu_warp_size,
+ const int thread_warp,
+ const int warp_index,
+ const int num_warps,
+ threadgroup int *warp_offset)
+{
#else
-template<uint blocksize, typename IsActiveOp>
-__device__ void gpu_parallel_active_index_array(const uint num_states,
- ccl_global int *indices,
- ccl_global int *num_indices,
- IsActiveOp is_active_op)
+ IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -66,61 +48,59 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint num_warps = blocksize / ccl_gpu_warp_size;
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#endif
- /* Test if state corresponding to this thread is active. */
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
- ccl_gpu_thread_mask(thread_warp));
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
}
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
- }
-
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
- }
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
}
+}
#ifdef __KERNEL_METAL__
-}; /* end class ActiveIndexContext */
-
-/* inject the required thread params into a struct, and redirect to its templated member function
- */
-# define gpu_parallel_active_index_array \
- ActiveIndexContext(metal_local_id, \
- metal_global_id, \
- metal_local_size, \
- simdgroup_size, \
- simd_lane_index, \
- simd_group_index, \
- num_simd_groups, \
- simdgroup_offset) \
- .active_index_array
+
+# define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+ const uint is_active = (ccl_gpu_global_id_x() < num_states) ? is_active_op(ccl_gpu_global_id_x()) : 0; \
+ gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active, \
+ metal_local_size, metal_local_id, metal_global_id, simdgroup_size, simd_lane_index, \
+ simd_group_index, num_simd_groups, simdgroup_offset)
+
+#else
+
+# define gpu_parallel_active_index_array(blocksize, num_states, indices, num_indices, is_active_op) \
+ gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+
#endif
CCL_NAMESPACE_END
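
For reference, the compaction algorithm that both backends now share can be sketched as a standalone CUDA kernel. This is an illustrative reconstruction based on the hunks above, not the Cycles source: the kernel name, the is_active() predicate, and the fixed block size are assumptions, and real Cycles code goes through the ccl_gpu_* abstractions shown in the diff.

    /* Standalone CUDA sketch of warp-ballot stream compaction (hypothetical demo).
     * Compile as a .cu file with nvcc; launch with blockDim.x == BLOCK_SIZE and
     * (BLOCK_SIZE / WARP_SIZE + 1) * sizeof(int) bytes of dynamic shared memory. */
    #define BLOCK_SIZE 512
    #define WARP_SIZE 32

    __device__ int is_active(int state_index)
    {
      return state_index % 2; /* arbitrary predicate, stands in for IsActiveOp */
    }

    __global__ void active_index_demo(int num_states, int *indices, int *num_indices)
    {
      extern __shared__ int warp_offset[]; /* num_warps + 1 ints */

      const int thread_index = threadIdx.x;
      const int thread_warp = thread_index % WARP_SIZE;
      const int warp_index = thread_index / WARP_SIZE;
      const int num_warps = BLOCK_SIZE / WARP_SIZE;
      const int state_index = blockIdx.x * BLOCK_SIZE + thread_index;

      /* 1. Test if the state corresponding to this thread is active. */
      const unsigned active = (state_index < num_states) ? is_active(state_index) : 0;

      /* 2. Within a warp, count how many active lanes precede this one. */
      const unsigned ballot = __ballot_sync(0xffffffffu, active);
      const unsigned lane_mask = (1u << thread_warp) - 1u;
      const int thread_offset = __popc(ballot & lane_mask);

      /* 3. Last lane of each warp records that warp's active count. */
      if (thread_warp == WARP_SIZE - 1) {
        warp_offset[warp_index] = thread_offset + active;
      }
      __syncthreads();

      /* 4. Last thread in the block turns per-warp counts into offsets and
       *    atomically reserves space in the global index array. */
      if (thread_index == BLOCK_SIZE - 1) {
        int offset = 0;
        for (int i = 0; i < num_warps; i++) {
          const int num_active = warp_offset[i];
          warp_offset[i] = offset;
          offset += num_active;
        }
        const int block_num_active = warp_offset[warp_index] + thread_offset + active;
        warp_offset[num_warps] = atomicAdd(num_indices, block_num_active);
      }
      __syncthreads();

      /* 5. Active threads write their state index at their compacted position. */
      if (active) {
        const int block_offset = warp_offset[num_warps];
        indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
      }
    }
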