git.blender.org/blender.git
author     Clément Foucault <foucault.clem@gmail.com>    2022-01-26 23:57:44 +0300
committer  Clément Foucault <foucault.clem@gmail.com>    2022-01-27 00:03:58 +0300
commit     4226c484bdbe7336f1221094916fcdfb12850034 (patch)
tree       33428e72be40105c222ca77935ee1554b702facc /intern/cycles/kernel/device/gpu
parent     55a6a8900aec81e94f4d82401d6051e3b5507c0e (diff)
parent     af87b6d8cb75d9d625378dee25d726a0d55f75c6 (diff)

Merge branch 'draw-viewport-data' into eevee-rewrite
# Conflicts:
#    release/scripts/startup/bl_ui/properties_data_camera.py
#    source/blender/blenkernel/BKE_camera.h
#    source/blender/blenkernel/BKE_node.h
#    source/blender/blenkernel/intern/camera.c
#    source/blender/blenlib/BLI_float2.hh
#    source/blender/blenlib/BLI_float3.hh
#    source/blender/blenlib/BLI_float4.hh
#    source/blender/blenlib/BLI_math_geom.h
#    source/blender/blenlib/intern/math_geom.c
#    source/blender/draw/CMakeLists.txt
#    source/blender/draw/engines/basic/basic_engine.c
#    source/blender/draw/engines/eevee/eevee_cryptomatte.c
#    source/blender/draw/engines/eevee/eevee_effects.c
#    source/blender/draw/engines/eevee/eevee_engine.c
#    source/blender/draw/engines/eevee/eevee_lightcache.c
#    source/blender/draw/engines/eevee/eevee_lightcache.h
#    source/blender/draw/engines/eevee/eevee_lightprobes.c
#    source/blender/draw/engines/eevee/eevee_lights.c
#    source/blender/draw/engines/eevee/eevee_materials.c
#    source/blender/draw/engines/eevee/eevee_motion_blur.c
#    source/blender/draw/engines/eevee/eevee_occlusion.c
#    source/blender/draw/engines/eevee/eevee_private.h
#    source/blender/draw/engines/eevee/eevee_render.c
#    source/blender/draw/engines/eevee/eevee_renderpasses.c
#    source/blender/draw/engines/eevee/eevee_sampling.c
#    source/blender/draw/engines/eevee/eevee_screen_raytrace.c
#    source/blender/draw/engines/eevee/eevee_shaders.c
#    source/blender/draw/engines/eevee/eevee_shadows.c
#    source/blender/draw/engines/eevee/eevee_shadows_cube.c
#    source/blender/draw/engines/eevee/eevee_temporal_sampling.c
#    source/blender/draw/engines/eevee/shaders/ambient_occlusion_lib.glsl
#    source/blender/draw/engines/eevee/shaders/closure_eval_lib.glsl
#    source/blender/draw/engines/eevee/shaders/common_utiltex_lib.glsl
#    source/blender/draw/engines/eevee/shaders/effect_dof_bokeh_frag.glsl
#    source/blender/draw/engines/eevee/shaders/effect_dof_gather_frag.glsl
#    source/blender/draw/engines/eevee/shaders/effect_dof_reduce_frag.glsl
#    source/blender/draw/engines/eevee/shaders/effect_reflection_resolve_frag.glsl
#    source/blender/draw/engines/eevee/shaders/effect_temporal_aa.glsl
#    source/blender/draw/engines/eevee/shaders/random_lib.glsl
#    source/blender/draw/engines/eevee/shaders/shadow_vert.glsl
#    source/blender/draw/engines/eevee/shaders/surface_lib.glsl
#    source/blender/draw/engines/eevee/shaders/surface_vert.glsl
#    source/blender/draw/engines/eevee/shaders/volumetric_lib.glsl
#    source/blender/draw/engines/external/external_engine.c
#    source/blender/draw/engines/gpencil/gpencil_engine.c
#    source/blender/draw/engines/image/image_engine.c
#    source/blender/draw/engines/overlay/overlay_engine.c
#    source/blender/draw/engines/select/select_engine.c
#    source/blender/draw/engines/workbench/shaders/workbench_volume_frag.glsl
#    source/blender/draw/engines/workbench/shaders/workbench_volume_vert.glsl
#    source/blender/draw/engines/workbench/workbench_engine.c
#    source/blender/draw/engines/workbench/workbench_shader.c
#    source/blender/draw/intern/DRW_render.h
#    source/blender/draw/intern/draw_debug.h
#    source/blender/draw/intern/draw_manager_data.c
#    source/blender/draw/intern/draw_manager_exec.c
#    source/blender/draw/intern/draw_view_data.h
#    source/blender/gpu/CMakeLists.txt
#    source/blender/gpu/GPU_material.h
#    source/blender/gpu/GPU_shader.h
#    source/blender/gpu/GPU_state.h
#    source/blender/gpu/GPU_vertex_buffer.h
#    source/blender/gpu/intern/gpu_codegen.c
#    source/blender/gpu/intern/gpu_material.c
#    source/blender/gpu/intern/gpu_material_library.h
#    source/blender/gpu/intern/gpu_node_graph.c
#    source/blender/gpu/intern/gpu_texture_private.hh
#    source/blender/gpu/intern/gpu_vertex_buffer.cc
#    source/blender/gpu/opengl/gl_shader.cc
#    source/blender/gpu/shaders/gpu_shader_common_obinfos_lib.glsl
#    source/blender/gpu/shaders/material/gpu_shader_material_shader_to_rgba.glsl
#    source/blender/nodes/shader/node_shader_tree.cc
#    source/blender/nodes/shader/nodes/node_shader_background.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_anisotropic.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_diffuse.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_glass.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_glossy.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_hair.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_hair_principled.c
#    source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
#    source/blender/nodes/shader/nodes/node_shader_bsdf_refraction.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_toon.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_translucent.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_transparent.cc
#    source/blender/nodes/shader/nodes/node_shader_bsdf_velvet.cc
#    source/blender/nodes/shader/nodes/node_shader_eevee_specular.cc
#    source/blender/nodes/shader/nodes/node_shader_emission.cc
#    source/blender/nodes/shader/nodes/node_shader_holdout.cc
#    source/blender/nodes/shader/nodes/node_shader_output_material.cc
#    source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
#    source/blender/nodes/shader/nodes/node_shader_tex_coord.cc
#    source/blender/nodes/shader/nodes/node_shader_vector_transform.cc
#    source/blender/nodes/shader/nodes/node_shader_volume_absorption.cc
#    source/blender/nodes/shader/nodes/node_shader_volume_principled.cc
#    source/blender/nodes/shader/nodes/node_shader_volume_scatter.cc
#    source/blender/render/RE_pipeline.h
#    source/blender/render/intern/initrender.c
Diffstat (limited to 'intern/cycles/kernel/device/gpu')
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h                  |  16
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 | 946
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  | 116
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h    |  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h  |  26
-rw-r--r--  intern/cycles/kernel/device/gpu/work_stealing.h          |  55
6 files changed, 704 insertions(+), 469 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index b015c78a8f5..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+ float x,
+ float y)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
template<typename T>
ccl_device_noinline T kernel_tex_image_interp_nanovdb(
- const TextureInfo &info, float x, float y, float z, uint interpolation)
+ ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -189,9 +191,9 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
}
#endif
-ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
/* float4, byte4, ushort4 and half4 */
const int texture_type = info.data_type;
@@ -221,12 +223,12 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float
}
}
-ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
int id,
float3 P,
InterpolationType interp)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
if (info.use_transform_3d) {
P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 7b79c0aedfa..eed005803e2 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -20,47 +20,64 @@
#include "kernel/device/gpu/parallel_prefix_sum.h"
#include "kernel/device/gpu/parallel_sorted_index.h"
-#include "kernel/integrator/integrator_state.h"
-#include "kernel/integrator/integrator_state_flow.h"
-#include "kernel/integrator/integrator_state_util.h"
-
-#include "kernel/integrator/integrator_init_from_bake.h"
-#include "kernel/integrator/integrator_init_from_camera.h"
-#include "kernel/integrator/integrator_intersect_closest.h"
-#include "kernel/integrator/integrator_intersect_shadow.h"
-#include "kernel/integrator/integrator_intersect_subsurface.h"
-#include "kernel/integrator/integrator_intersect_volume_stack.h"
-#include "kernel/integrator/integrator_shade_background.h"
-#include "kernel/integrator/integrator_shade_light.h"
-#include "kernel/integrator/integrator_shade_shadow.h"
-#include "kernel/integrator/integrator_shade_surface.h"
-#include "kernel/integrator/integrator_shade_volume.h"
-
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_work_stealing.h"
+#include "kernel/sample/lcg.h"
+
+/* Include constant tables before entering Metal's context class scope (context_begin.h) */
+#include "kernel/tables.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_begin.h"
+#endif
+
+#include "kernel/device/gpu/work_stealing.h"
+
+#include "kernel/integrator/state.h"
+#include "kernel/integrator/state_flow.h"
+#include "kernel/integrator/state_util.h"
+
+#include "kernel/integrator/init_from_bake.h"
+#include "kernel/integrator/init_from_camera.h"
+#include "kernel/integrator/intersect_closest.h"
+#include "kernel/integrator/intersect_shadow.h"
+#include "kernel/integrator/intersect_subsurface.h"
+#include "kernel/integrator/intersect_volume_stack.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_surface.h"
+#include "kernel/integrator/shade_volume.h"
+
+#include "kernel/bake/bake.h"
+
+#include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_end.h"
+#endif
+
+#include "kernel/film/read.h"
/* --------------------------------------------------------------------
* Integrator.
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_reset(int num_states)
+ ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
const int state = ccl_gpu_global_id_x();
if (state < num_states) {
- INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
- INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
}
}
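[Editor's note] The hunk above shows the entry-point pattern applied throughout this file: the hard-coded kernel_gpu_* names are wrapped in ccl_gpu_kernel_signature(...), calls into device code go through ccl_gpu_kernel_call(...), and the INTEGRATOR_STATE* macros now take the state as an explicit argument, so the same kernel source can expand to CUDA, HIP or Metal entry points. As a rough sketch only, assuming a CUDA-style backend (the real macro definitions live in the per-device compat headers, which are not part of this diff, and may differ):

    /* Illustrative assumption of a CUDA-side expansion; not the actual macros. */
    #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
      extern "C" __global__ void __launch_bounds__(block_num_threads)
    /* (thread_num_registers would feed a second __launch_bounds__ argument; omitted here.) */
    #define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
    #define ccl_gpu_kernel_call(x) x /* Metal would instead forward through its context class. */

With definitions like these, the kernel above still compiles to the previous extern "C" __global__ void kernel_gpu_integrator_reset(int num_states) entry point on CUDA, while Metal can substitute its own signature and call wrappers.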
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_camera,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -71,7 +88,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -80,16 +97,18 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
- get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+ ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
- integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
}
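[Editor's note] For the tile decomposition in integrator_init_from_camera, each thread turns its flat work_index into a tile plus an index inside that tile. Worked example with hypothetical numbers: with max_tile_work_size = 4096 and work_index = 10000, tile_index = 10000 / 4096 = 2 and tile_work_index = 10000 - 2 * 4096 = 1808, so this thread renders the 1808th pixel/sample of the third tile and uses path state tile->path_index_offset + 1808.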
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_bake,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -100,7 +119,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -109,211 +128,312 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
- get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+ ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
- integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_closest,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_closest(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_shadow,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_shadow(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_subsurface(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_volume_stack(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_background(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_background,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_background(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_light(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_light,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_light(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_shadow(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_shadow,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_shadow(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
}
}
+#ifdef __KERNEL_METAL__
+constant int __dummy_constant [[function_constant(0)]];
+#endif
+
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface_raytrace(NULL, state, render_buffer);
+
+#ifdef __KERNEL_METAL__
+ KernelGlobals kg = NULL;
+ /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
+ * Dummy offset should not affect result, but somehow fixes bug! */
+ kg += __dummy_constant;
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
+#else
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
+#endif
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_volume(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_volume,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_volume(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
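[Editor's note] The queued-paths kernel above shows the second recurring change: the C++ lambdas previously passed to gpu_parallel_active_index_array are replaced by the ccl_gpu_kernel_lambda macro, which is expected to declare a small functor whose captured parameters become members, exposed as ccl_gpu_kernel_lambda_pass; this avoids relying on lambda support in every backend compiler. A minimal hand-written equivalent, assuming a CUDA-style backend (names here are illustrative, not the actual macro expansion):

    /* Hypothetical functor equivalent of
     *   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
     *                         int kernel_index);
     */
    struct IsQueuedForKernel {
      int kernel_index; /* filled in by the caller, like ccl_gpu_kernel_lambda_pass.kernel_index */
      __device__ bool operator()(const int state) const
      {
        return INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index;
      }
    };

gpu_parallel_active_index_array then invokes this predicate once per state to decide whether that state index is appended to the output array.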
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_active_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
- (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
- (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
- });
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(
- int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
- gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
- INTEGRATOR_STATE(path, shader_sort_key) :
- GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
- });
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+ int num_states,
+ int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
+ int kernel_index)
+{
+ ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+ INTEGRATOR_STATE(state, path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
+ const uint state_index = ccl_gpu_global_id_x();
+ gpu_parallel_sorted_index_array(state_index,
+ num_states,
+ num_states_limit,
+ indices,
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
+{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+ int num_active_paths);
+ ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
+ }
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+ int num_active_paths);
+ ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) &&
- ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
- (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -321,14 +441,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_state_move(to_state, from_state);
+ ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *values, int num_values)
+ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+ prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+ gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
@@ -336,16 +456,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
- int sx,
- int sy,
- int sw,
- int sh,
- float threshold,
- bool reset,
- int offset,
- int stride,
- uint *num_active_pixels)
+ ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ ccl_global uint *num_active_pixels)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / sw;
@@ -354,37 +475,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
bool converged = true;
if (x < sw && y < sh) {
- converged = kernel_adaptive_sampling_convergence_check(
- nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
}
}
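[Editor's note] Besides the ccl_global and signature changes, the convergence-check hunk above replaces the CUDA-specific __popc with the portable popcount and lets the ballot mask type be deduced with auto, since not every backend returns a 32-bit mask. The underlying pattern is a warp-aggregated atomic: every lane votes through ballot, and only lane 0 performs one atomic add with the popcount of the vote. A condensed CUDA-style sketch of the same idea, with hypothetical names:

    /* One atomic per warp instead of one per active thread. As the NOTE above
     * says, every lane named in the mask must execute the ballot. */
    __device__ void count_active(bool active, unsigned int *counter)
    {
      const unsigned int mask = __ballot_sync(0xFFFFFFFFu, active);
      if ((threadIdx.x % warpSize) == 0) {
        atomicAdd(counter, (unsigned int)__popc(mask));
      }
    }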
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_x(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int y = ccl_gpu_global_id_x();
if (y < sh) {
- kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_y(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int x = ccl_gpu_global_id_x();
if (x < sw) {
- kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
}
}
@@ -393,12 +528,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+ ccl_gpu_kernel_signature(cryptomatte_postprocess,
+ ccl_global float *render_buffer,
+ int num_pixels)
{
const int pixel_index = ccl_gpu_global_id_x();
if (pixel_index < num_pixels) {
- kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
}
}
@@ -406,202 +543,142 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
* Film.
*/
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
- float *pixels,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int dst_offset,
- int dst_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
- ccl_global const float *buffer = render_buffer + render_buffer_offset;
- ccl_global float *pixel = pixels +
- (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
- processor(kfilm_convert, buffer, pixel);
-}
-
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
- ccl_global const float *buffer = render_buffer + render_buffer_offset;
-
- float pixel[4];
- processor(kfilm_convert, buffer, pixel);
-
- film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
+ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
+ const int rgba_offset,
+ const int rgba_stride,
+ const int x,
+ const int y,
+ const half4 half_pixel)
+{
+ /* Work around HIP issue with half float display, see T92972. */
+#ifdef __KERNEL_HIP__
+ ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
+ out[0] = half_pixel.x;
+ out[1] = half_pixel.y;
+ out[2] = half_pixel.z;
+ out[3] = half_pixel.w;
+#else
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
- float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- processor(kfilm_convert, buffer, pixel_rgba);
- pixel_rgba[3] = 1.0f;
- });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float value;
- processor(kfilm_convert, buffer, &value);
-
- pixel_rgba[0] = value;
- pixel_rgba[1] = value;
- pixel_rgba[2] = value;
- pixel_rgba[3] = 1.0f;
- });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
- ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
- (const KernelFilmConvert kfilm_convert, \
- float *pixels, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+ *out = half_pixel;
+#endif
+}
+
+#ifdef __KERNEL_METAL__
+
+/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
+ * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
+# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
+ float local_pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
+ if (input_channel_count >= 1) { \
+ pixel[0] = local_pixel[0]; \
+ } \
+ if (input_channel_count >= 2) { \
+ pixel[1] = local_pixel[1]; \
+ } \
+ if (input_channel_count >= 3) { \
+ pixel[2] = local_pixel[2]; \
+ } \
+ if (input_channel_count >= 4) { \
+ pixel[3] = local_pixel[3]; \
+ }
+
+#else
+
+# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);
+
+#endif
+
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global float *pixels, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_common(&kfilm_convert, \
- pixels, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ ccl_global float *pixel = pixels + \
+ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
+\
+ FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
} \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
- (const KernelFilmConvert kfilm_convert, \
- uchar4 *rgba, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+\
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
- rgba, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
- }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ const half4 half_pixel = float4_to_half4_display( \
+ make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+ kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
+ }
+
+/* 1 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
+
+#undef KERNEL_FILM_CONVERT_VARIANT
/* --------------------------------------------------------------------
* Shader evaluation.
@@ -610,28 +687,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
/* Displacement */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
- float4 *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_displace,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
+ }
+}
+
+/* Background */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ ccl_gpu_kernel_signature(shader_eval_background,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_displace_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
}
}
-/* Background Shader Evaluation */
+/* Curve Shadow Transparency */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
- float4 *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_background_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(
+ kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
}
}
@@ -640,15 +735,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_preprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int pass_denoised)
+ ccl_gpu_kernel_signature(filter_color_preprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -659,31 +755,34 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
- float *color_out = buffer + pass_denoised;
+ ccl_global float *color_out = buffer + pass_denoised;
color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int guiding_pass_normal,
- const float *render_buffer,
- int render_offset,
- int render_stride,
- int render_pass_stride,
- int render_pass_sample_count,
- int render_pass_denoising_albedo,
- int render_pass_denoising_normal,
- int full_x,
- int full_y,
- int width,
- int height,
- int num_samples)
+ ccl_gpu_kernel_signature(filter_guiding_preprocess,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ int guiding_pass_flow,
+ ccl_global const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int render_pass_motion,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -694,10 +793,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
- const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+ ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
float pixel_scale;
if (render_pass_sample_count == PASS_UNUSED) {
@@ -711,8 +810,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_albedo != PASS_UNUSED) {
kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
- const float *aledo_in = buffer + render_pass_denoising_albedo;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = aledo_in[0] * pixel_scale;
albedo_out[1] = aledo_in[1] * pixel_scale;
@@ -720,24 +819,36 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
/* Normal pass. */
- if (render_pass_denoising_normal != PASS_UNUSED) {
+ if (guiding_pass_normal != PASS_UNUSED) {
kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
- const float *normal_in = buffer + render_pass_denoising_normal;
- float *normal_out = guiding_pixel + guiding_pass_normal;
+ ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+ ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
normal_out[0] = normal_in[0] * pixel_scale;
normal_out[1] = normal_in[1] * pixel_scale;
normal_out[2] = normal_in[2] * pixel_scale;
}
+
+ /* Flow pass. */
+ if (guiding_pass_flow != PASS_UNUSED) {
+ kernel_assert(render_pass_motion != PASS_UNUSED);
+
+ ccl_global const float *motion_in = buffer + render_pass_motion;
+ ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
+
+ flow_out[0] = -motion_in[0] * pixel_scale;
+ flow_out[1] = -motion_in[1] * pixel_scale;
+ }
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int width,
- int height)
+ ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
{
kernel_assert(guiding_pass_albedo != PASS_UNUSED);
@@ -750,9 +861,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = 0.5f;
albedo_out[1] = 0.5f;
@@ -760,20 +871,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_postprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int num_samples,
- int pass_noisy,
- int pass_denoised,
- int pass_sample_count,
- int num_components,
- bool use_compositing)
+ ccl_gpu_kernel_signature(filter_color_postprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -784,7 +896,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
float pixel_scale;
if (pass_sample_count == PASS_UNUSED) {
@@ -794,7 +906,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
pixel_scale = __float_as_uint(buffer[pass_sample_count]);
}
- float *denoised_pixel = buffer + pass_denoised;
+ ccl_global float *denoised_pixel = buffer + pass_denoised;
denoised_pixel[0] *= pixel_scale;
denoised_pixel[1] *= pixel_scale;
@@ -807,13 +919,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
/* Currently compositing passes are either 3-component (derived by dividing light passes)
* or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
* simplifies logic and avoids extra memory allocation. */
- const float *noisy_pixel = buffer + pass_noisy;
+ ccl_global const float *noisy_pixel = buffer + pass_noisy;
denoised_pixel[3] = noisy_pixel[3];
}
else {
/* Assigning to zero since this is a default alpha value for 3-component passes, and it
* is an opaque pixel for 4 component passes. */
-
denoised_pixel[3] = 0;
}
}
@@ -823,21 +934,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
- uint *num_possible_splits)
+ ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+ int num_states,
+ ccl_global uint *num_possible_splits)
{
const int state = ccl_gpu_global_id_x();
bool can_split = false;
if (state < num_states) {
- can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const auto can_split_mask = ccl_gpu_ballot(can_split);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
}
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index db4a4bf71e0..a5320edcb3c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
*
* Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
-#include "util/util_atomic.h"
+#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+ ActiveIndexContext(int _thread_index,
+ int _global_index,
+ int _threadgroup_size,
+ int _simdgroup_size,
+ int _simd_lane_index,
+ int _simd_group_index,
+ int _num_simd_groups,
+ threadgroup int *_simdgroup_offset)
+ : thread_index(_thread_index),
+ global_index(_global_index),
+ blocksize(_threadgroup_size),
+ ccl_gpu_warp_size(_simdgroup_size),
+ thread_warp(_simd_lane_index),
+ warp_index(_simd_group_index),
+ num_warps(_num_simd_groups),
+ warp_offset(_simdgroup_offset)
+ {
+ }
+
+ const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+ num_warps;
+ threadgroup int *warp_offset;
+
+ template<uint blocksizeDummy, typename IsActiveOp>
+ void active_index_array(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ IsActiveOp is_active_op)
+ {
+ const uint state_index = global_index;
+#else
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
- int *indices,
- int *num_indices,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint warp_index = thread_index / ccl_gpu_warp_size;
const uint num_warps = blocksize / ccl_gpu_warp_size;
- /* Test if state corresponding to this thread is active. */
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
- const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
- }
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
}
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
}
-}
+
+#ifdef __KERNEL_METAL__
+}; /* End of struct ActiveIndexContext. */
+
+/* Inject the required thread parameters into a struct and redirect to its templated
+ * member function. */
+# define gpu_parallel_active_index_array \
+ ActiveIndexContext(metal_local_id, \
+ metal_global_id, \
+ metal_local_size, \
+ simdgroup_size, \
+ simd_lane_index, \
+ simd_group_index, \
+ num_simd_groups, \
+ simdgroup_offset) \
+ .active_index_array
+#endif
CCL_NAMESPACE_END
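
The compaction above works in two levels: ballot/popcount gives each thread its rank among the active lanes of its warp, the last thread of the block turns per-warp counts into exclusive offsets and reserves a slice of the global index array with one atomic add, and every active thread then scatters its state index. A serial CPU sketch of the same bookkeeping follows; the name active_index_array_cpu and the fixed 32-lane warp are assumptions, and this is illustrative only, not the kernel itself.

// Hypothetical serial C++ sketch of gpu_parallel_active_index_array (not the kernel itself):
// per-warp active counts, an exclusive scan of warp totals, one atomic reservation per
// block, then scattered writes of the active state indices.
#include <atomic>
#include <cstdio>
#include <vector>

static void active_index_array_cpu(const std::vector<int> &is_active,
                                   std::vector<int> &indices,
                                   std::atomic<int> &num_indices,
                                   const int blocksize,
                                   const int warp_size)
{
  const int num_states = int(is_active.size());
  for (int block_start = 0; block_start < num_states; block_start += blocksize) {
    const int num_warps = blocksize / warp_size;
    std::vector<int> warp_offset(num_warps + 1, 0);

    /* Per-warp totals (ballot + popcount per thread on the GPU). */
    for (int w = 0; w < num_warps; w++) {
      for (int t = 0; t < warp_size; t++) {
        const int state = block_start + w * warp_size + t;
        warp_offset[w] += (state < num_states) ? is_active[state] : 0;
      }
    }

    /* Exclusive scan of warp totals; one atomic add reserves the block's slice. */
    int block_total = 0;
    for (int w = 0; w < num_warps; w++) {
      const int count = warp_offset[w];
      warp_offset[w] = block_total;
      block_total += count;
    }
    warp_offset[num_warps] = num_indices.fetch_add(block_total);

    /* Scatter: each active state writes at block_offset + warp_offset + lane rank. */
    for (int w = 0; w < num_warps; w++) {
      int lane_rank = 0;
      for (int t = 0; t < warp_size; t++) {
        const int state = block_start + w * warp_size + t;
        if (state < num_states && is_active[state]) {
          indices[warp_offset[num_warps] + warp_offset[w] + lane_rank++] = state;
        }
      }
    }
  }
}

int main()
{
  std::vector<int> is_active(100);
  for (int i = 0; i < int(is_active.size()); i++) {
    is_active[i] = (i % 3 == 0) ? 1 : 0;
  }
  std::vector<int> indices(is_active.size(), -1);
  std::atomic<int> num_indices{0};
  active_index_array_cpu(is_active, indices, num_indices, /*blocksize=*/128, /*warp_size=*/32);
  printf("active states found: %d\n", num_indices.load()); /* Prints 34. */
  return 0;
}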
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..4bd002c27e4 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
* This is used for an array the size of the number of shaders in the scene
* which is not usually huge, so might not be a significant bottleneck. */
-#include "util/util_atomic.h"
+#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
@@ -33,16 +33,20 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+ ccl_global int *counter,
+ ccl_global int *prefix_sum,
+ const int num_values)
{
- if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ if (global_id != 0) {
return;
}
int offset = 0;
for (int i = 0; i < num_values; i++) {
- const int new_offset = offset + values[i];
- values[i] = offset;
+ const int new_offset = offset + counter[i];
+ prefix_sum[i] = offset;
+ counter[i] = 0;
offset = new_offset;
}
}
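
The rewritten prefix sum now consumes the per-key counters, emits exclusive offsets, and clears the counters so they can be reused by the next scheduling iteration. A host-side equivalent, under the hypothetical name prefix_sum_and_reset, is essentially:

// Hypothetical host-side equivalent of the rewritten gpu_parallel_prefix_sum:
// exclusive prefix sum of `counter` into `prefix_sum`, zeroing `counter` as it goes.
#include <cstdio>
#include <vector>

static void prefix_sum_and_reset(std::vector<int> &counter, std::vector<int> &prefix_sum)
{
  int offset = 0;
  for (size_t i = 0; i < counter.size(); i++) {
    const int new_offset = offset + counter[i];
    prefix_sum[i] = offset; /* Exclusive offset of bucket i. */
    counter[i] = 0;         /* Reset so the next iteration can count into it again. */
    offset = new_offset;
  }
}

int main()
{
  std::vector<int> counter = {3, 0, 5, 2};
  std::vector<int> prefix_sum(counter.size(), 0);
  prefix_sum_and_reset(counter, prefix_sum);
  for (const int v : prefix_sum) {
    printf("%d ", v); /* Prints: 0 3 3 8 */
  }
  printf("\n");
  return 0;
}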
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..c092e2a21ee 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN
*
* TODO: there may be ways to optimize this to avoid this many atomic ops? */
-#include "util/util_atomic.h"
+#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
@@ -33,20 +33,30 @@ CCL_NAMESPACE_BEGIN
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
- int *indices,
- int *num_indices,
- int *key_prefix_sum,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+ const uint num_states,
+ const int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
GetKeyOp get_key_op)
{
- const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
const int key = (state_index < num_states) ? get_key_op(state_index) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
- indices[index] = state_index;
+ if (index < num_states_limit) {
+ /* Assign state index. */
+ indices[index] = state_index;
+ }
+ else {
+ /* Can't process this state now, increase the counter again so that
+ * it will be handled in another iteration. */
+ atomic_fetch_and_add_uint32(&key_counter[key], 1);
+ }
}
}
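
Together with the prefix sum above, the sorted-index kernel buckets states by key: the exclusive prefix sums act as per-key write cursors, and any state that would land past num_states_limit is pushed back into key_counter so a later launch can pick it up. A single-threaded sketch with the hypothetical name sorted_index_array_cpu, assuming key_prefix_sum already holds the exclusive sums, could be:

// Hypothetical single-threaded sketch of gpu_parallel_sorted_index_array (illustrative only).
// key_prefix_sum is assumed to already hold the exclusive per-key sums produced above and
// is used as a set of write cursors; overflowing states are re-counted in key_counter.
#include <cstdio>
#include <vector>

static void sorted_index_array_cpu(const std::vector<int> &keys, /* One key per state. */
                                   const int num_states_limit,
                                   std::vector<int> &indices,
                                   std::vector<int> &key_counter,
                                   std::vector<int> &key_prefix_sum)
{
  for (int state_index = 0; state_index < int(keys.size()); state_index++) {
    const int key = keys[state_index];
    /* atomic_fetch_and_add_uint32 on the GPU; a plain post-increment in this serial sketch. */
    const int index = key_prefix_sum[key]++;
    if (index < num_states_limit) {
      indices[index] = state_index; /* Assign state index. */
    }
    else {
      key_counter[key]++; /* Defer this state to another iteration. */
    }
  }
}

int main()
{
  const std::vector<int> keys = {1, 0, 1, 1, 0}; /* Key (e.g. shader) per state. */
  std::vector<int> key_counter = {0, 0};
  std::vector<int> key_prefix_sum = {0, 2};      /* Exclusive sums of the counts {2, 3}. */
  std::vector<int> indices(4, -1);
  sorted_index_array_cpu(keys, /*num_states_limit=*/4, indices, key_counter, key_prefix_sum);
  for (const int i : indices) {
    printf("%d ", i); /* Prints: 1 4 0 2 (state 3 deferred via key_counter). */
  }
  printf("\n");
  return 0;
}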
diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h
new file mode 100644
index 00000000000..c3083948057
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/work_stealing.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * Utility functions for work stealing
+ */
+
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+ uint sample_offset, pixel_offset;
+
+ if (kernel_data.integrator.scrambling_distance < 0.9f) {
+ /* Keep threads for the same sample together. */
+ uint tile_pixels = tile->w * tile->h;
+ sample_offset = global_work_index / tile_pixels;
+ pixel_offset = global_work_index - sample_offset * tile_pixels;
+ }
+ else {
+ /* Keep threads for the same pixel together.
+ * This appears to improve performance by a few % on CUDA and OptiX. */
+ sample_offset = global_work_index % tile->num_samples;
+ pixel_offset = global_work_index / tile->num_samples;
+ }
+
+ uint y_offset = pixel_offset / tile->w;
+ uint x_offset = pixel_offset - y_offset * tile->w;
+
+ *x = tile->x + x_offset;
+ *y = tile->y + y_offset;
+ *sample = tile->start_sample + sample_offset;
+}
+
+CCL_NAMESPACE_END
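
The two branches of get_work_pixel are simply two orderings of the (pixel, sample) pairs: sample-major when the scrambling distance is small, pixel-major otherwise. Below is a host-side sketch of both mappings, using a simplified WorkTile stand-in for KernelWorkTile; it is illustrative only, not part of the patch.

// Hypothetical host-side sketch of the two index orderings in get_work_pixel.
// The WorkTile struct is a simplified stand-in for KernelWorkTile.
#include <cstdio>

struct WorkTile {
  unsigned x, y, w, h;
  unsigned start_sample, num_samples;
};

static void work_pixel_sample_major(const WorkTile &tile, unsigned global_work_index,
                                    unsigned *x, unsigned *y, unsigned *sample)
{
  /* Used when the scrambling distance is small: all threads of one sample are adjacent. */
  const unsigned tile_pixels = tile.w * tile.h;
  const unsigned sample_offset = global_work_index / tile_pixels;
  const unsigned pixel_offset = global_work_index - sample_offset * tile_pixels;
  *x = tile.x + pixel_offset % tile.w;
  *y = tile.y + pixel_offset / tile.w;
  *sample = tile.start_sample + sample_offset;
}

static void work_pixel_pixel_major(const WorkTile &tile, unsigned global_work_index,
                                   unsigned *x, unsigned *y, unsigned *sample)
{
  /* Default ordering: consecutive threads render consecutive samples of the same pixel. */
  const unsigned sample_offset = global_work_index % tile.num_samples;
  const unsigned pixel_offset = global_work_index / tile.num_samples;
  *x = tile.x + pixel_offset % tile.w;
  *y = tile.y + pixel_offset / tile.w;
  *sample = tile.start_sample + sample_offset;
}

int main()
{
  const WorkTile tile = {0, 0, 4, 4, 0, 8};
  unsigned x, y, sample;
  work_pixel_pixel_major(tile, 13, &x, &y, &sample);
  printf("pixel-major: x=%u y=%u sample=%u\n", x, y, sample); /* x=1 y=0 sample=5 */
  return 0;
}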