git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/device/gpu/kernel.h')
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h  837
1 file changed, 403 insertions(+), 434 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 5848ba5df9d..2ec6a49ec7b 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,10 @@
#include "kernel/device/gpu/parallel_sorted_index.h"
#include "kernel/device/gpu/work_stealing.h"
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_begin.h"
+#endif
+
#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"
@@ -40,6 +44,11 @@
#include "kernel/bake/bake.h"
#include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_end.h"
+#endif
+
#include "kernel/film/read.h"
/* --------------------------------------------------------------------
@@ -47,7 +56,8 @@
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_reset(int num_states)
+ccl_gpu_kernel_signature(integrator_reset,
+ int num_states)
{
const int state = ccl_gpu_global_id_x();
@@ -58,10 +68,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_camera,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -72,7 +83,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -83,14 +94,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
- integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_bake,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -101,7 +114,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -112,228 +125,260 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
- integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_closest,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_closest(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_shadow,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_shadow(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_subsurface(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_volume_stack(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_background(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_background,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_background(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_light(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_light,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_light(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_shadow(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_shadow,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_shadow(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_volume(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_volume,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_volume(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel);
- });
-}
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+ int kernel_index)
+ .kernel_index = kernel_index;
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
-{
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+ int kernel_index)
+ .kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_active_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
- });
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(int num_states,
- int num_states_limit,
- int *indices,
- int *num_indices,
- int *key_counter,
- int *key_prefix_sum,
- int kernel)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
- gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+ int num_states,
+ int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
+ int kernel_index)
+{
+ ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+ INTEGRATOR_STATE(state, path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+ int kernel_index)
+ .kernel_index = kernel_index;
+
+ const uint state_index = ccl_gpu_global_id_x();
+ gpu_parallel_sorted_index_array(
+ state_index,
num_states,
num_states_limit,
indices,
- num_indices,
- key_counter,
- key_prefix_sum,
- [kernel](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
- INTEGRATOR_STATE(state, path, shader_sort_key) :
- GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
- });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
-{
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
+{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+ int num_active_paths)
+ .num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -341,28 +386,31 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_state_move(NULL, to_state, from_state);
+ ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+ int num_active_paths)
+ .num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) &&
- (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -370,15 +418,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_shadow_state_move(NULL, to_state, from_state);
+ ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
+ccl_gpu_kernel(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+ prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
- counter, prefix_sum, num_values);
+ gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
@@ -386,16 +433,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
- int sx,
- int sy,
- int sw,
- int sh,
- float threshold,
- bool reset,
- int offset,
- int stride,
- uint *num_active_pixels)
+ ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ ccl_global uint *num_active_pixels)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / sw;
@@ -404,37 +452,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
bool converged = true;
if (x < sw && y < sh) {
- converged = kernel_adaptive_sampling_convergence_check(
- nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_x(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int y = ccl_gpu_global_id_x();
if (y < sh) {
- kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_y(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int x = ccl_gpu_global_id_x();
if (x < sw) {
- kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
}
}
@@ -443,12 +505,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+ ccl_gpu_kernel_signature(cryptomatte_postprocess,
+ ccl_global float *render_buffer,
+ int num_pixels)
{
const int pixel_index = ccl_gpu_global_id_x();
if (pixel_index < num_pixels) {
- kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
}
}
@@ -456,206 +520,102 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
* Film.
*/
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
- float *pixels,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int dst_offset,
- int dst_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
- ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
- y * stride * kfilm_convert->pass_stride;
-
- ccl_global float *pixel = pixels +
- (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
- processor(kfilm_convert, buffer, pixel);
-}
-
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
- ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
- y * stride * kfilm_convert->pass_stride;
-
- float pixel[4];
- processor(kfilm_convert, buffer, pixel);
-
- film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
- ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
- *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- processor(kfilm_convert, buffer, pixel_rgba);
- pixel_rgba[3] = 1.0f;
- });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float value;
- processor(kfilm_convert, buffer, &value);
-
- pixel_rgba[0] = value;
- pixel_rgba[1] = value;
- pixel_rgba[2] = value;
- pixel_rgba[3] = 1.0f;
- });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
- ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
- (const KernelFilmConvert kfilm_convert, \
- float *pixels, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_common(&kfilm_convert, \
- pixels, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ ccl_global float *out = ((ccl_global float *)rgba) + rgba_offset + y * rgba_stride + x; \
+ *(ccl_global float4 *)out = make_float4(pixel[0], pixel[1], pixel[2], pixel[3]); \
} \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
- (const KernelFilmConvert kfilm_convert, \
- uchar4 *rgba, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+\
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
- rgba, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
- }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + (rgba_offset + y * rgba_stride + x); \
+ *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+ }
+
+/* 1 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
/* --------------------------------------------------------------------
* Shader evaluation.
@@ -664,42 +624,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
/* Displacement */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_displace,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_displace_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
}
}
/* Background */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_background,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_background_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
}
}
/* Curve Shadow Transparency */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(
+ kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
}
}
@@ -708,15 +672,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_preprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int pass_denoised)
+ ccl_gpu_kernel_signature(filter_color_preprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -727,31 +692,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
- float *color_out = buffer + pass_denoised;
+ ccl_global float *color_out = buffer + pass_denoised;
color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int guiding_pass_normal,
- const float *render_buffer,
- int render_offset,
- int render_stride,
- int render_pass_stride,
- int render_pass_sample_count,
- int render_pass_denoising_albedo,
- int render_pass_denoising_normal,
- int full_x,
- int full_y,
- int width,
- int height,
- int num_samples)
+ ccl_gpu_kernel_signature(filter_guiding_preprocess,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ ccl_global const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -762,10 +728,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
- const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+ ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
float pixel_scale;
if (render_pass_sample_count == PASS_UNUSED) {
@@ -779,8 +745,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_albedo != PASS_UNUSED) {
kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
- const float *aledo_in = buffer + render_pass_denoising_albedo;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = aledo_in[0] * pixel_scale;
albedo_out[1] = aledo_in[1] * pixel_scale;
@@ -791,8 +757,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_normal != PASS_UNUSED) {
kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
- const float *normal_in = buffer + render_pass_denoising_normal;
- float *normal_out = guiding_pixel + guiding_pass_normal;
+ ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+ ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
normal_out[0] = normal_in[0] * pixel_scale;
normal_out[1] = normal_in[1] * pixel_scale;
@@ -801,11 +767,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int width,
- int height)
+ ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
{
kernel_assert(guiding_pass_albedo != PASS_UNUSED);
@@ -818,9 +785,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = 0.5f;
albedo_out[1] = 0.5f;
@@ -828,20 +795,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_postprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int num_samples,
- int pass_noisy,
- int pass_denoised,
- int pass_sample_count,
- int num_components,
- bool use_compositing)
+ ccl_gpu_kernel_signature(filter_color_postprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -852,7 +820,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
float pixel_scale;
if (pass_sample_count == PASS_UNUSED) {
@@ -862,7 +830,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
pixel_scale = __float_as_uint(buffer[pass_sample_count]);
}
- float *denoised_pixel = buffer + pass_denoised;
+ ccl_global float *denoised_pixel = buffer + pass_denoised;
denoised_pixel[0] *= pixel_scale;
denoised_pixel[1] *= pixel_scale;
@@ -875,7 +843,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
/* Currently compositing passes are either 3-component (derived by dividing light passes)
* or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
* simplifies logic and avoids extra memory allocation. */
- const float *noisy_pixel = buffer + pass_noisy;
+ ccl_global const float *noisy_pixel = buffer + pass_noisy;
denoised_pixel[3] = noisy_pixel[3];
}
else {
@@ -891,21 +859,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
- uint *num_possible_splits)
+ ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+ int num_states,
+ ccl_global uint *num_possible_splits)
{
const int state = ccl_gpu_global_id_x();
bool can_split = false;
if (state < num_states) {
- can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const auto can_split_mask = ccl_gpu_ballot(can_split);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
}
}