diff options
Diffstat (limited to 'intern/cycles/kernel/device/gpu/parallel_active_index.h')
-rw-r--r-- | intern/cycles/kernel/device/gpu/parallel_active_index.h | 83 |
1 files changed, 83 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h new file mode 100644 index 00000000000..85500bf4d07 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active. + * + * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 + +template<uint blocksize, typename IsActiveOp> +__device__ void gpu_parallel_active_index_array(const uint num_states, + int *indices, + int *num_indices, + IsActiveOp is_active_op) +{ + extern ccl_gpu_shared int warp_offset[]; + + const uint thread_index = ccl_gpu_thread_idx_x; + const uint thread_warp = thread_index % ccl_gpu_warp_size; + + const uint warp_index = thread_index / ccl_gpu_warp_size; + const uint num_warps = blocksize / ccl_gpu_warp_size; + + /* Test if state corresponding to this thread is active. */ + const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; + + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; + } + + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } + + ccl_gpu_syncthreads(); + + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } +} + +CCL_NAMESPACE_END |