Diffstat (limited to 'intern/cycles/kernel/device')
33 files changed, 2480 insertions(+), 711 deletions(-)
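
The bulk of the CPU-side churn below comes from two mechanical changes: the util_*/kernel_* headers lose their prefixes (util/util_half.h becomes util/half.h, and so on), and the KernelGlobals struct is renamed to KernelGlobalsCPU, with KernelGlobals itself becoming a typedef for a restrict-qualified pointer to it. What follows is a minimal sketch, not from the Cycles sources, of why that typedef lets every kernel signature change from `const KernelGlobals *kg` to plain `KernelGlobals kg` while function bodies keep dereferencing `kg->` unchanged; the struct field and the interp function here are illustrative stand-ins for the real texture/KernelData members.

/* Minimal sketch (not the Cycles sources): KernelGlobals as an opaque
 * pointer typedef. The field and function body are illustrative stand-ins. */

#include <cstdio>

#define ccl_restrict __restrict__

struct KernelGlobalsCPU {
  int num_textures; /* stands in for texture slots, KernelData, OSL state */
};

/* The patch's CPU definition: the handle type itself carries the pointer. */
typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals;

/* Signatures change from `const KernelGlobals *kg` to `KernelGlobals kg`... */
static float kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
  /* ...but bodies still use `kg->`, because the typedef is itself a pointer. */
  return (float)(kg->num_textures + id) + x + y;
}

int main()
{
  KernelGlobalsCPU kg_storage = {8};
  KernelGlobals kg = &kg_storage;
  printf("%.2f\n", kernel_tex_image_interp(kg, 1, 0.5f, 0.25f));
  return 0;
}

On the GPU side the same name binds to `ccl_global const KernelGlobalsGPU *ccl_restrict` (see cuda/globals.h below), so kernel code shared between backends compiles against either definition without per-device #ifdefs in the signatures.
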
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h index bfd936c7bbd..5ccca52255f 100644 --- a/intern/cycles/kernel/device/cpu/compat.h +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -26,13 +26,11 @@ # pragma GCC diagnostic ignored "-Wuninitialized" #endif -#include "util/util_half.h" -#include "util/util_math.h" -#include "util/util_simd.h" -#include "util/util_texture.h" -#include "util/util_types.h" - -#define ccl_addr_space +#include "util/half.h" +#include "util/math.h" +#include "util/simd.h" +#include "util/texture.h" +#include "util/types.h" /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h index 98b036e269d..746e48b9880 100644 --- a/intern/cycles/kernel/device/cpu/globals.h +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -18,8 +18,9 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/tables.h" +#include "kernel/types.h" +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN @@ -34,9 +35,9 @@ struct OSLThreadData; struct OSLShadingSystem; #endif -typedef struct KernelGlobals { +typedef struct KernelGlobalsCPU { #define KERNEL_TEX(type, name) texture<type> name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" KernelData __data; @@ -51,7 +52,9 @@ typedef struct KernelGlobals { /* **** Run-time data **** */ ProfilingState profiler; -} KernelGlobals; +} KernelGlobalsCPU; + +typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals; /* Abstraction macros */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h index 57e81ab186d..93f956e354d 100644 --- a/intern/cycles/kernel/device/cpu/image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -72,12 +72,12 @@ template<typename T> struct TextureInterpolator { static ccl_always_inline float4 read(half4 r) { - return half4_to_float4(r); + return half4_to_float4_image(r); } static ccl_always_inline float4 read(half r) { - float f = half_to_float(r); + float f = half_to_float_image(r); return make_float4(f, f, f, 1.0f); } @@ -583,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator { #undef SET_CUBIC_SPLINE_WEIGHTS -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -611,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index ac1cdf5fffe..a16c637d5ac 100644 --- a/intern/cycles/kernel/device/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -64,7 +64,7 @@ CCL_NAMESPACE_BEGIN /* Memory Copy */ -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t) +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t) { if (strcmp(name, "__data") == 0) { kg->__data = *(KernelData *)host; @@ -74,7 +74,7 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, 
void *host, size_t) } } -void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size) +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size) { if (0) { } @@ -85,7 +85,7 @@ void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, s kg->tname.data = (type *)mem; \ kg->tname.width = size; \ } -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" else { assert(0); } diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index ae2a841835a..6af8094b1ea 100644 --- a/intern/cycles/kernel/device/cpu/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -18,9 +18,10 @@ /* CPU Kernel Interface */ -#include "util/util_types.h" +#include "util/half.h" +#include "util/types.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" CCL_NAMESPACE_BEGIN @@ -29,17 +30,17 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct IntegratorStateCPU; -struct KernelGlobals; +struct KernelGlobalsCPU; struct KernelData; -KernelGlobals *kernel_globals_create(); -void kernel_globals_free(KernelGlobals *kg); +KernelGlobalsCPU *kernel_globals_create(); +void kernel_globals_free(KernelGlobalsCPU *kg); -void *kernel_osl_memory(const KernelGlobals *kg); -bool kernel_osl_use(const KernelGlobals *kg); +void *kernel_osl_memory(const KernelGlobalsCPU *kg); +bool kernel_osl_use(const KernelGlobalsCPU *kg); -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); -void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size); +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t size); +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size); #define KERNEL_ARCH cpu #include "kernel/device/cpu/kernel_arch.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index 81f328c710b..61f62f3136b 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -21,23 +21,23 @@ */ #define KERNEL_INTEGRATOR_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state) #define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state, \ ccl_global float *render_buffer) #define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state, \ KernelWorkTile *tile, \ ccl_global float *render_buffer) KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera); KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake); -KERNEL_INTEGRATOR_FUNCTION(intersect_closest); +KERNEL_INTEGRATOR_SHADE_FUNCTION(intersect_closest); KERNEL_INTEGRATOR_FUNCTION(intersect_shadow); KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface); KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack); @@ -52,25 +52,61 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); #undef KERNEL_INTEGRATOR_INIT_FUNCTION #undef 
KERNEL_INTEGRATOR_SHADE_FUNCTION +#define KERNEL_FILM_CONVERT_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride); \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride); + +KERNEL_FILM_CONVERT_FUNCTION(depth) +KERNEL_FILM_CONVERT_FUNCTION(mist) +KERNEL_FILM_CONVERT_FUNCTION(sample_count) +KERNEL_FILM_CONVERT_FUNCTION(float) + +KERNEL_FILM_CONVERT_FUNCTION(light_path) +KERNEL_FILM_CONVERT_FUNCTION(float3) + +KERNEL_FILM_CONVERT_FUNCTION(motion) +KERNEL_FILM_CONVERT_FUNCTION(cryptomatte) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow) +KERNEL_FILM_CONVERT_FUNCTION(combined) +KERNEL_FILM_CONVERT_FUNCTION(float4) + +#undef KERNEL_FILM_CONVERT_FUNCTION + /* -------------------------------------------------------------------- * Shader evaluation. */ -void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset); -void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset); /* -------------------------------------------------------------------- * Adaptive sampling. */ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( - const KernelGlobals *kg, + const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, @@ -79,14 +115,14 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( int offset, int stride); -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, int width, int offset, int stride); -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, @@ -98,16 +134,8 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals * * Cryptomatte. */ -void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index); -/* -------------------------------------------------------------------- - * Bake. - */ -/* TODO(sergey): Needs to be re-implemented. Or not? 
Brecht did it already :) */ - -void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 1432abfd330..747c47c34c9 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -29,27 +29,28 @@ # include "kernel/device/cpu/globals.h" # include "kernel/device/cpu/image.h" -# include "kernel/integrator/integrator_state.h" -# include "kernel/integrator/integrator_state_flow.h" -# include "kernel/integrator/integrator_state_util.h" - -# include "kernel/integrator/integrator_init_from_camera.h" -# include "kernel/integrator/integrator_init_from_bake.h" -# include "kernel/integrator/integrator_intersect_closest.h" -# include "kernel/integrator/integrator_intersect_shadow.h" -# include "kernel/integrator/integrator_intersect_subsurface.h" -# include "kernel/integrator/integrator_intersect_volume_stack.h" -# include "kernel/integrator/integrator_shade_background.h" -# include "kernel/integrator/integrator_shade_light.h" -# include "kernel/integrator/integrator_shade_shadow.h" -# include "kernel/integrator/integrator_shade_surface.h" -# include "kernel/integrator/integrator_shade_volume.h" -# include "kernel/integrator/integrator_megakernel.h" - -# include "kernel/kernel_film.h" -# include "kernel/kernel_adaptive_sampling.h" -# include "kernel/kernel_bake.h" -# include "kernel/kernel_id_passes.h" +# include "kernel/integrator/state.h" +# include "kernel/integrator/state_flow.h" +# include "kernel/integrator/state_util.h" + +# include "kernel/integrator/init_from_camera.h" +# include "kernel/integrator/init_from_bake.h" +# include "kernel/integrator/intersect_closest.h" +# include "kernel/integrator/intersect_shadow.h" +# include "kernel/integrator/intersect_subsurface.h" +# include "kernel/integrator/intersect_volume_stack.h" +# include "kernel/integrator/shade_background.h" +# include "kernel/integrator/shade_light.h" +# include "kernel/integrator/shade_shadow.h" +# include "kernel/integrator/shade_surface.h" +# include "kernel/integrator/shade_volume.h" +# include "kernel/integrator/megakernel.h" + +# include "kernel/film/adaptive_sampling.h" +# include "kernel/film/id_passes.h" +# include "kernel/film/read.h" + +# include "kernel/bake/bake.h" #else # define STUB_ASSERT(arch, name) \ @@ -69,8 +70,20 @@ CCL_NAMESPACE_BEGIN # define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__) #endif +/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so + * that it does not contain unused fields. 
*/ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + #define DEFINE_INTEGRATOR_KERNEL(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ IntegratorStateCPU *state) \ { \ KERNEL_INVOKE(name, kg, state); \ @@ -78,43 +91,45 @@ CCL_NAMESPACE_BEGIN #define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ - const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ { \ KERNEL_INVOKE(name, kg, state, render_buffer); \ } -/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so - * that it does not contain unused fields. */ -#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ - IntegratorStateCPU *state, \ - KernelWorkTile *tile, \ - ccl_global float *render_buffer) \ +#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state) \ { \ - return KERNEL_INVOKE( \ - name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + KERNEL_INVOKE(name, kg, &state->shadow); \ + } + +#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \ } DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) -DEFINE_INTEGRATOR_KERNEL(intersect_closest) -DEFINE_INTEGRATOR_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_SHADE_KERNEL(intersect_closest) DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) -DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) +DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow) /* -------------------------------------------------------------------- * Shader evaluation. 
*/ -void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset) { #ifdef KERNEL_STUB @@ -124,9 +139,9 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, #endif } -void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset) { #ifdef KERNEL_STUB @@ -136,12 +151,25 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, #endif } +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency); +#else + kernel_curve_shadow_transparency_evaluate(kg, input, output, offset); +#endif +} + /* -------------------------------------------------------------------- * Adaptive sampling. */ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( - const KernelGlobals *kg, + const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, @@ -159,7 +187,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( #endif } -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, @@ -174,7 +202,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals * #endif } -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, @@ -193,7 +221,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals * * Cryptomatte. */ -void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index) { @@ -205,23 +233,83 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, } /* -------------------------------------------------------------------- - * Bake. + * Film Convert. */ -/* TODO(sergey): Needs to be re-implemented. Or not? 
Brecht did it already :) */ -void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -#if 0 -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, bake); -# else -# ifdef __BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -# endif -# endif /* KERNEL_STUB */ +#ifdef KERNEL_STUB + +# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride) \ + { \ + STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \ + } \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride) \ + { \ + STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \ + } + +#else + +# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride) \ + { \ + for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \ + film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \ + } \ + } \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride) \ + { \ + for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \ + float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \ + film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \ + if (is_float) { \ + pixel_rgba[1] = pixel_rgba[0]; \ + pixel_rgba[2] = pixel_rgba[0]; \ + } \ + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \ + *pixel = float4_to_half4_display( \ + make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \ + } \ + } + #endif -} + +KERNEL_FILM_CONVERT_FUNCTION(depth, true) +KERNEL_FILM_CONVERT_FUNCTION(mist, true) +KERNEL_FILM_CONVERT_FUNCTION(sample_count, true) +KERNEL_FILM_CONVERT_FUNCTION(float, true) + +KERNEL_FILM_CONVERT_FUNCTION(light_path, false) +KERNEL_FILM_CONVERT_FUNCTION(float3, false) + +KERNEL_FILM_CONVERT_FUNCTION(motion, false) +KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false) +KERNEL_FILM_CONVERT_FUNCTION(combined, false) +KERNEL_FILM_CONVERT_FUNCTION(float4, false) + +#undef KERNEL_FILM_CONVERT_FUNCTION #undef KERNEL_INVOKE #undef DEFINE_INTEGRATOR_KERNEL diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp index 220768036ab..cece750a255 100644 --- a/intern/cycles/kernel/device/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp index 90c05113cbe..fad4581236e 100644 --- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp index fb85ef5b0d0..5fb4849ac08 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp index 87baf04258a..c9424682fd4 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp index bb421d58815..849ebf51989 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 3c85a8e7bd2..658dec102b1 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -52,14 +52,14 @@ typedef unsigned long long uint64_t; #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) @@ -76,6 +76,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -85,7 +86,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ @@ -129,7 +129,14 @@ __device__ half __float2half(const float f) return val; } +__device__ float __half2float(const half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); + return val; +} + /* Types */ -#include "util/util_half.h" -#include "util/util_types.h" +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h index 46196dcdb51..003881d7912 100644 --- a/intern/cycles/kernel/device/cuda/config.h +++ b/intern/cycles/kernel/device/cuda/config.h @@ -92,12 +92,29 @@ /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. */ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) 
\ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h index 169047175f5..e5023fad40c 100644 --- a/intern/cycles/kernel/device/cuda/globals.h +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -18,23 +18,25 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" -#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */ -struct KernelGlobals { +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Global scene data and textures */ __constant__ KernelData __data; #define KERNEL_TEX(type, name) const __constant__ __device__ type *name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" /* Integrator state */ __constant__ IntegratorStateGPU __integrator_state; diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index b015c78a8f5..0900a45c83d 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ template<typename T> -ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info, + float x, + float y) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f /* Fast tricubic texture lookup using 8 trilinear lookups. 
*/ template<typename T> ccl_device_noinline T -kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl template<typename T> ccl_device_noinline T kernel_tex_image_interp_nanovdb( - const TextureInfo &info, float x, float y, float z, uint interpolation) + ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -189,9 +191,9 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( } #endif -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); /* float4, byte4, ushort4 and half4 */ const int texture_type = info.data_type; @@ -221,12 +223,12 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); if (info.use_transform_3d) { P = transform_point(&info.transform_3d, P); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 7b79c0aedfa..eed005803e2 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -20,47 +20,64 @@ #include "kernel/device/gpu/parallel_prefix_sum.h" #include "kernel/device/gpu/parallel_sorted_index.h" -#include "kernel/integrator/integrator_state.h" -#include "kernel/integrator/integrator_state_flow.h" -#include "kernel/integrator/integrator_state_util.h" - -#include "kernel/integrator/integrator_init_from_bake.h" -#include "kernel/integrator/integrator_init_from_camera.h" -#include "kernel/integrator/integrator_intersect_closest.h" -#include "kernel/integrator/integrator_intersect_shadow.h" -#include "kernel/integrator/integrator_intersect_subsurface.h" -#include "kernel/integrator/integrator_intersect_volume_stack.h" -#include "kernel/integrator/integrator_shade_background.h" -#include "kernel/integrator/integrator_shade_light.h" -#include "kernel/integrator/integrator_shade_shadow.h" -#include "kernel/integrator/integrator_shade_surface.h" -#include "kernel/integrator/integrator_shade_volume.h" - -#include "kernel/kernel_adaptive_sampling.h" -#include "kernel/kernel_bake.h" -#include "kernel/kernel_film.h" -#include "kernel/kernel_work_stealing.h" +#include "kernel/sample/lcg.h" + +/* Include constant tables before entering Metal's context class scope (context_begin.h) */ +#include "kernel/tables.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_begin.h" +#endif + +#include "kernel/device/gpu/work_stealing.h" + +#include "kernel/integrator/state.h" +#include "kernel/integrator/state_flow.h" +#include "kernel/integrator/state_util.h" + +#include "kernel/integrator/init_from_bake.h" +#include "kernel/integrator/init_from_camera.h" +#include "kernel/integrator/intersect_closest.h" +#include 
"kernel/integrator/intersect_shadow.h" +#include "kernel/integrator/intersect_subsurface.h" +#include "kernel/integrator/intersect_volume_stack.h" +#include "kernel/integrator/shade_background.h" +#include "kernel/integrator/shade_light.h" +#include "kernel/integrator/shade_shadow.h" +#include "kernel/integrator/shade_surface.h" +#include "kernel/integrator/shade_volume.h" + +#include "kernel/bake/bake.h" + +#include "kernel/film/adaptive_sampling.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_end.h" +#endif + +#include "kernel/film/read.h" /* -------------------------------------------------------------------- * Integrator. */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_reset(int num_states) + ccl_gpu_kernel_signature(integrator_reset, int num_states) { const int state = ccl_gpu_global_id_x(); if (state < num_states) { - INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; - INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_camera, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -71,7 +88,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -80,16 +97,18 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = tile->path_index_offset + tile_work_index; uint x, y, sample; - get_work_pixel(tile, tile_work_index, &x, &y, &sample); + ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample)); - integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_bake, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -100,7 +119,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -109,211 +128,312 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = tile->path_index_offset + tile_work_index; uint x, y, sample; - get_work_pixel(tile, tile_work_index, &x, &y, &sample); + 
ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample)); - integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_closest, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_closest(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_shadow, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_shadow(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_subsurface, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_subsurface(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_volume_stack, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_volume_stack(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_background(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_background, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; - integrator_shade_background(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_light(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_light, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_light(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_shadow(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_shadow, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_shadow(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer)); } } +#ifdef __KERNEL_METAL__ +constant int __dummy_constant [[function_constant(0)]]; +#endif + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface_raytrace(NULL, state, render_buffer); + +#ifdef __KERNEL_METAL__ + KernelGlobals kg = NULL; + /* Workaround Ambient Occlusion and Bevel nodes not working with Metal. + * Dummy offset should not affect result, but somehow fixes bug! 
*/ + kg += __dummy_constant; + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer)); +#else + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); +#endif } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_volume(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_volume, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_volume(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_active_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - 
kernel_gpu_integrator_terminated_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == 0) && - (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); - }); + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array( - int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { - gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? - INTEGRATOR_STATE(path, shader_sort_key) : - GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; - }); + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_sorted_paths_array, + int num_states, + int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, + int kernel_index) +{ + ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ? 
+ INTEGRATOR_STATE(state, path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + + const uint state_index = ccl_gpu_global_id_x(); + gpu_parallel_sorted_index_array(state_index, + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) +{ + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state)); + } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) { + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - ((INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -321,14 +441,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(to_state, from_state); + 
ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *values, int num_values) +ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( + prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values); + gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- @@ -336,16 +456,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, - int sx, - int sy, - int sw, - int sh, - float threshold, - bool reset, - int offset, - int stride, - uint *num_active_pixels) + ccl_gpu_kernel_signature(adaptive_sampling_convergence_check, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + ccl_global uint *num_active_pixels) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / sw; @@ -354,37 +475,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = kernel_adaptive_sampling_convergence_check( - nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ - const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_x( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_x, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int y = ccl_gpu_global_id_x(); if (y < sh) { - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_y( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_y, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int x = ccl_gpu_global_id_x(); if (x < sw) { - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } @@ -393,12 +528,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) + ccl_gpu_kernel_signature(cryptomatte_postprocess, + ccl_global float *render_buffer, + int num_pixels) { const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } @@ -406,202 +543,142 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) * Film. */ -/* Common implementation for float destination. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, - float *pixels, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int dst_offset, - int dst_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; - ccl_global float *pixel = pixels + - (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; - - processor(kfilm_convert, buffer, pixel); -} - -/* Common implementation for half4 destination and 4-channel input pass. 
*/ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; - - float pixel[4]; - processor(kfilm_convert, buffer, pixel); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - +ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba, + const int rgba_offset, + const int rgba_stride, + const int x, + const int y, + const half4 half_pixel) +{ + /* Work around HIP issue with half float display, see T92972. */ +#ifdef __KERNEL_HIP__ + ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4; + out[0] = half_pixel.x; + out[1] = half_pixel.y; + out[2] = half_pixel.z; + out[3] = half_pixel.w; +#else ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; - float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); -} - -/* Common implementation for half4 destination and 3-channel input pass. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); -} - -/* Common implementation for half4 destination and single channel input pass. 
*/ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float value; - processor(kfilm_convert, buffer, &value); - - pixel_rgba[0] = value; - pixel_rgba[1] = value; - pixel_rgba[2] = value; - pixel_rgba[3] = 1.0f; - }); -} - -#define KERNEL_FILM_CONVERT_PROC(name) \ - ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name - -#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ - (const KernelFilmConvert kfilm_convert, \ - float *pixels, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ + *out = half_pixel; +#endif +} + +#ifdef __KERNEL_METAL__ + +/* Fetch into a local variable on Metal - there is minimal overhead. Templating the + * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */ +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ + float local_pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \ + if (input_channel_count >= 1) { \ + pixel[0] = local_pixel[0]; \ + } \ + if (input_channel_count >= 2) { \ + pixel[1] = local_pixel[1]; \ + } \ + if (input_channel_count >= 3) { \ + pixel[2] = local_pixel[2]; \ + } \ + if (input_channel_count >= 4) { \ + pixel[3] = local_pixel[3]; \ + } + +#else + +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); + +#endif + +#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global float *pixels, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_common(&kfilm_convert, \ - pixels, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + ccl_global float *pixel = pixels + \ + (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ +\ + FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \ } \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ - (const KernelFilmConvert kfilm_convert, \ - uchar4 *rgba, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +\ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + 
ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global uchar4 *rgba, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ - rgba, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ - } - -KERNEL_FILM_CONVERT_DEFINE(depth, value) -KERNEL_FILM_CONVERT_DEFINE(mist, value) -KERNEL_FILM_CONVERT_DEFINE(sample_count, value) -KERNEL_FILM_CONVERT_DEFINE(float, value) - -KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) -KERNEL_FILM_CONVERT_DEFINE(float3, rgb) - -KERNEL_FILM_CONVERT_DEFINE(motion, rgba) -KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) -KERNEL_FILM_CONVERT_DEFINE(combined, rgba) -KERNEL_FILM_CONVERT_DEFINE(float4, rgba) - -#undef KERNEL_FILM_CONVERT_DEFINE -#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE -#undef KERNEL_FILM_CONVERT_PROC + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + float pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +\ + if (input_channel_count == 1) { \ + pixel[1] = pixel[2] = pixel[0]; \ + } \ + if (input_channel_count <= 3) { \ + pixel[3] = 1.0f; \ + } \ +\ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + const half4 half_pixel = float4_to_half4_display( \ + make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \ + kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \ + } + +/* 1 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(depth, 1) +KERNEL_FILM_CONVERT_VARIANT(mist, 1) +KERNEL_FILM_CONVERT_VARIANT(sample_count, 1) +KERNEL_FILM_CONVERT_VARIANT(float, 1) + +/* 3 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(light_path, 3) +KERNEL_FILM_CONVERT_VARIANT(float3, 3) + +/* 4 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(motion, 4) +KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4) +KERNEL_FILM_CONVERT_VARIANT(combined, 4) +KERNEL_FILM_CONVERT_VARIANT(float4, 4) + +#undef KERNEL_FILM_CONVERT_VARIANT /* -------------------------------------------------------------------- * Shader evaluation. 
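 */

Before moving on to shader evaluation: the half_rgba variants above all funnel through the same widening rules, with one-channel passes replicated to RGB and anything narrower than four channels displayed with opaque alpha. A standalone restatement of those rules (plain C++, hypothetical helper name):

static void widen_to_rgba(float pixel[4], const int input_channel_count)
{
  /* Replicate a single-channel value across RGB. */
  if (input_channel_count == 1) {
    pixel[1] = pixel[2] = pixel[0];
  }
  /* 1- and 3-channel passes carry no alpha of their own; show them opaque. */
  if (input_channel_count <= 3) {
    pixel[3] = 1.0f;
  }
}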
@@ -610,28 +687,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) /* Displacement */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float4 *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_displace, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i)); + } +} + +/* Background */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + ccl_gpu_kernel_signature(shader_eval_background, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_displace_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i)); } } -/* Background Shader Evaluation */ +/* Curve Shadow Transparency */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float4 *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_background_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call( + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i)); } } @@ -640,15 +735,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_preprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int pass_denoised) + ccl_gpu_kernel_signature(filter_color_preprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -659,31 +755,34 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; - float *color_out = buffer + pass_denoised; + ccl_global float *color_out = buffer + pass_denoised; color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int guiding_pass_normal, - const float *render_buffer, - int render_offset, - int render_stride, - int render_pass_stride, - int render_pass_sample_count, - int render_pass_denoising_albedo, - int render_pass_denoising_normal, - int full_x, - int full_y, - int width, - int height, - int num_samples) + ccl_gpu_kernel_signature(filter_guiding_preprocess, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + 
int guiding_pass_albedo,
+                           int guiding_pass_normal,
+                           int guiding_pass_flow,
+                           ccl_global const float *render_buffer,
+                           int render_offset,
+                           int render_stride,
+                           int render_pass_stride,
+                           int render_pass_sample_count,
+                           int render_pass_denoising_albedo,
+                           int render_pass_denoising_normal,
+                           int render_pass_motion,
+                           int full_x,
+                           int full_y,
+                           int width,
+                           int height,
+                           int num_samples)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -694,10 +793,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t guiding_pixel_index = x + y * width;
-  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
 
   const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
-  const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+  ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
 
   float pixel_scale;
   if (render_pass_sample_count == PASS_UNUSED) {
@@ -711,8 +810,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_albedo != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
 
-    const float *albedo_in = buffer + render_pass_denoising_albedo;
-    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+    ccl_global const float *albedo_in = buffer + render_pass_denoising_albedo;
+    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
 
     albedo_out[0] = albedo_in[0] * pixel_scale;
     albedo_out[1] = albedo_in[1] * pixel_scale;
@@ -720,24 +819,36 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   /* Normal pass. */
-  if (render_pass_denoising_normal != PASS_UNUSED) {
+  if (guiding_pass_normal != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
 
-    const float *normal_in = buffer + render_pass_denoising_normal;
-    float *normal_out = guiding_pixel + guiding_pass_normal;
+    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
 
     normal_out[0] = normal_in[0] * pixel_scale;
     normal_out[1] = normal_in[1] * pixel_scale;
     normal_out[2] = normal_in[2] * pixel_scale;
   }
+
+  /* Flow pass. 
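*/

The addressing above is the same for all three guiding passes: pixels are packed tightly at guiding_pass_stride floats each, and a pass is just a fixed float offset inside a pixel. A small sketch of that computation (hypothetical helper, plain C++, uint64_t as in the kernels):

static float *guiding_pass_ptr(float *guiding_buffer,
                               const int x,
                               const int y,
                               const int width,
                               const int guiding_pass_stride,
                               const int pass_offset)
{
  const uint64_t guiding_pixel_index = x + (uint64_t)y * width;
  return guiding_buffer + guiding_pixel_index * guiding_pass_stride + pass_offset;
}

/*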
*/ + if (guiding_pass_flow != PASS_UNUSED) { + kernel_assert(render_pass_motion != PASS_UNUSED); + + ccl_global const float *motion_in = buffer + render_pass_motion; + ccl_global float *flow_out = guiding_pixel + guiding_pass_flow; + + flow_out[0] = -motion_in[0] * pixel_scale; + flow_out[1] = -motion_in[1] * pixel_scale; + } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int width, - int height) + ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) { kernel_assert(guiding_pass_albedo != PASS_UNUSED); @@ -750,9 +861,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = 0.5f; albedo_out[1] = 0.5f; @@ -760,20 +871,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_postprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int num_samples, - int pass_noisy, - int pass_denoised, - int pass_sample_count, - int num_components, - bool use_compositing) + ccl_gpu_kernel_signature(filter_color_postprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -784,7 +896,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; float pixel_scale; if (pass_sample_count == PASS_UNUSED) { @@ -794,7 +906,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) pixel_scale = __float_as_uint(buffer[pass_sample_count]); } - float *denoised_pixel = buffer + pass_denoised; + ccl_global float *denoised_pixel = buffer + pass_denoised; denoised_pixel[0] *= pixel_scale; denoised_pixel[1] *= pixel_scale; @@ -807,13 +919,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) /* Currently compositing passes are either 3-component (derived by dividing light passes) * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it * simplifies logic and avoids extra memory allocation. */ - const float *noisy_pixel = buffer + pass_noisy; + ccl_global const float *noisy_pixel = buffer + pass_noisy; denoised_pixel[3] = noisy_pixel[3]; } else { /* Assigning to zero since this is a default alpha value for 3-component passes, and it * is an opaque pixel for 4 component passes. 
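*/

A detail shared by the guiding preprocess and the color postprocess above: when adaptive sampling is active, the per-pixel sample count lives in a float render pass but holds raw integer bits, so the kernels read it back with a bit reinterpretation (__float_as_uint) rather than a numeric conversion. A host-side C++ sketch of that round trip, with hypothetical helper names:

#include <cstdint>
#include <cstring>

static float uint_as_float(const uint32_t value)
{
  float result;
  std::memcpy(&result, &value, sizeof(result)); /* reinterpret, do not convert */
  return result;
}

/* Mirrors __float_as_uint() in the kernels above. */
static uint32_t float_as_uint(const float value)
{
  uint32_t result;
  std::memcpy(&result, &value, sizeof(result));
  return result;
}

/*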
*/ - denoised_pixel[3] = 0; } } @@ -823,21 +934,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, - uint *num_possible_splits) + ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits, + int num_states, + ccl_global uint *num_possible_splits) { const int state = ccl_gpu_global_id_x(); bool can_split = false; if (state < num_states) { - can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state)); } /* NOTE: All threads specified in the mask must execute the intrinsic. */ - const uint can_split_mask = ccl_gpu_ballot(can_split); + const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index db4a4bf71e0..a5320edcb3c 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN * * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 @@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifdef __KERNEL_METAL__ +struct ActiveIndexContext { + ActiveIndexContext(int _thread_index, + int _global_index, + int _threadgroup_size, + int _simdgroup_size, + int _simd_lane_index, + int _simd_group_index, + int _num_simd_groups, + threadgroup int *_simdgroup_offset) + : thread_index(_thread_index), + global_index(_global_index), + blocksize(_threadgroup_size), + ccl_gpu_warp_size(_simdgroup_size), + thread_warp(_simd_lane_index), + warp_index(_simd_group_index), + num_warps(_num_simd_groups), + warp_offset(_simdgroup_offset) + { + } + + const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, + num_warps; + threadgroup int *warp_offset; + + template<uint blocksizeDummy, typename IsActiveOp> + void active_index_array(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, + IsActiveOp is_active_op) + { + const uint state_index = global_index; +#else template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, - int *indices, - int *num_indices, + ccl_global int *indices, + ccl_global int *num_indices, IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint warp_index = thread_index / ccl_gpu_warp_size; const uint num_warps = blocksize / ccl_gpu_warp_size; - /* Test if state corresponding to this thread is active. */ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; - const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#endif - /* For each thread within a warp compute how many other active states precede it. 
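*/

This step, implemented just below with ccl_gpu_thread_mask, is the core of the compaction: restricting the ballot to the lanes strictly below the current one and counting the set bits gives each active thread its exclusive rank inside the warp. A CUDA-style sketch with illustrative names that are not part of this patch:

__device__ unsigned int warp_exclusive_rank(const bool is_active, const unsigned int lane)
{
  /* Bit i is set when lane i is active. */
  const unsigned int active_mask = __ballot_sync(0xFFFFFFFFu, is_active);

  /* Keep only lanes strictly below this one; the lane-0 guard avoids the
   * 32-bit shift that C++ leaves undefined. */
  const unsigned int lower_lanes = (lane == 0) ? 0u : (0xFFFFFFFFu >> (32 - lane));

  return __popc(active_mask & lower_lanes);
}

/*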
*/ - const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; - /* Last thread in warp stores number of active states for each warp. */ - if (thread_warp == ccl_gpu_warp_size - 1) { - warp_offset[warp_index] = thread_offset + is_active; - } + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); - ccl_gpu_syncthreads(); - - /* Last thread in block converts per-warp sizes to offsets, increments global size of - * index array and gets offset to write to. */ - if (thread_index == blocksize - 1) { - /* TODO: parallelize this. */ - int offset = 0; - for (int i = 0; i < num_warps; i++) { - int num_active = warp_offset[i]; - warp_offset[i] = offset; - offset += num_active; + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; } - const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; - warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); - } + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } - ccl_gpu_syncthreads(); + ccl_gpu_syncthreads(); - /* Write to index array. */ - if (is_active) { - const uint block_offset = warp_offset[num_warps]; - indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } } -} + +#ifdef __KERNEL_METAL__ +}; /* end class ActiveIndexContext */ + +/* inject the required thread params into a struct, and redirect to its templated member function + */ +# define gpu_parallel_active_index_array \ + ActiveIndexContext(metal_local_id, \ + metal_global_id, \ + metal_local_size, \ + simdgroup_size, \ + simd_lane_index, \ + simd_group_index, \ + num_simd_groups, \ + simdgroup_offset) \ + .active_index_array +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index a1349e82efb..4bd002c27e4 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN * This is used for an array the size of the number of shaders in the scene * which is not usually huge, so might not be a significant bottleneck. 
*/ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 @@ -33,16 +33,20 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +__device__ void gpu_parallel_prefix_sum(const int global_id, + ccl_global int *counter, + ccl_global int *prefix_sum, + const int num_values) { - if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + if (global_id != 0) { return; } int offset = 0; for (int i = 0; i < num_values; i++) { - const int new_offset = offset + values[i]; - values[i] = offset; + const int new_offset = offset + counter[i]; + prefix_sum[i] = offset; + counter[i] = 0; offset = new_offset; } } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 9bca1fad22f..c092e2a21ee 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN * * TODO: there may be ways to optimize this to avoid this many atomic ops? */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 @@ -33,20 +33,30 @@ CCL_NAMESPACE_BEGIN #endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -template<uint blocksize, typename GetKeyOp> -__device__ void gpu_parallel_sorted_index_array(const uint num_states, - int *indices, - int *num_indices, - int *key_prefix_sum, +template<typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint state_index, + const uint num_states, + const int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, GetKeyOp get_key_op) { - const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; const int key = (state_index < num_states) ? get_key_op(state_index) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); - indices[index] = state_index; + if (index < num_states_limit) { + /* Assign state index. */ + indices[index] = state_index; + } + else { + /* Can't process this state now, increase the counter again so that + * it will be handled in another iteration. */ + atomic_fetch_and_add_uint32(&key_counter[key], 1); + } } } diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h new file mode 100644 index 00000000000..c3083948057 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/work_stealing.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * Utility functions for work stealing
+ */
+
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
+{
+  uint sample_offset, pixel_offset;
+
+  if (kernel_data.integrator.scrambling_distance < 0.9f) {
+    /* Keep threads for the same sample together. */
+    uint tile_pixels = tile->w * tile->h;
+    sample_offset = global_work_index / tile_pixels;
+    pixel_offset = global_work_index - sample_offset * tile_pixels;
+  }
+  else {
+    /* Keep threads for the same pixel together.
+     * Appears to improve performance by a few % on CUDA and OptiX. */
+    sample_offset = global_work_index % tile->num_samples;
+    pixel_offset = global_work_index / tile->num_samples;
+  }
+
+  uint y_offset = pixel_offset / tile->w;
+  uint x_offset = pixel_offset - y_offset * tile->w;
+
+  *x = tile->x + x_offset;
+  *y = tile->y + y_offset;
+  *sample = tile->start_sample + sample_offset;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 95338fe7d6e..fff7a09e884 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,14 +45,14 @@ typedef unsigned long long uint64_t;
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
 #define ccl_private
 #define ccl_may_alias
-#define ccl_addr_space
 #define ccl_restrict __restrict__
 #define ccl_loop_no_unroll
 #define ccl_align(n) __align__(n)
@@ -75,6 +75,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -84,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
@@ -117,5 +117,5 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object tex
 
 /* Types */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
index 2fde0d46015..7ec744d8ad2 100644
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. 
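*/

The computation described here is a plain division of the register file. With illustrative numbers only, not taken from this patch: a 65536-register multiprocessor, 512-thread blocks and 64 registers per thread give 65536 / (512 * 64) = 2 resident blocks as the second __launch_bounds__ argument. As a sketch:

/* Illustrative constants; the real values come from the GPU configuration. */
constexpr int kMultiprocessorMaxRegisters = 65536;

constexpr int min_blocks_per_multiprocessor(const int block_num_threads,
                                            const int thread_num_registers)
{
  return kMultiprocessorMaxRegisters / (block_num_threads * thread_num_registers);
}

static_assert(min_blocks_per_multiprocessor(512, 64) == 2, "two blocks stay resident");

/*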
*/ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h index 39978ae7899..d9a560d668b 100644 --- a/intern/cycles/kernel/device/hip/globals.h +++ b/intern/cycles/kernel/device/hip/globals.h @@ -18,24 +18,25 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" -#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */ -struct KernelGlobals { - /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Global scene data and textures */ __constant__ KernelData __data; #define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" /* Integrator state */ __constant__ IntegratorStateGPU __integrator_state; diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h new file mode 100644 index 00000000000..1222b68f0ee --- /dev/null +++ b/intern/cycles/kernel/device/metal/compat.h @@ -0,0 +1,316 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
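*/

Before the Metal header starts: the ccl_gpu_kernel_lambda macro defined for HIP above is easiest to read through an expansion. With a hypothetical predicate state_value(state) == threshold capturing one int (state_value is not a real kernel function), the macro produces a function object like this:

struct KernelLambda {
  int threshold; /* captured state, declared through the variadic arguments */
  __device__ int operator()(const int state)
  {
    return (state_value(state) == threshold);
  }
} ccl_gpu_kernel_lambda_pass;

The Metal variant further down does the same, but additionally captures a reference to the MetalKernelContext so the body can reach kernel globals.

/*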
+ */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_METAL__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#include <metal_atomic> +#include <metal_pack> +#include <metal_stdlib> +#include <simd/simd.h> + +using namespace metal; + +#ifdef __METALRT__ +using namespace metal::raytracing; +#endif + +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wuninitialized" + +/* Qualifiers */ + +#define ccl_device +#define ccl_device_inline ccl_device +#define ccl_device_forceinline ccl_device +#define ccl_device_noinline ccl_device __attribute__((noinline)) +#define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device +#define ccl_global device +#define ccl_inline_constant static constant constexpr +#define ccl_device_constant constant +#define ccl_constant constant +#define ccl_gpu_shared threadgroup +#define ccl_private thread +#define ccl_may_alias +#define ccl_restrict __restrict +#define ccl_loop_no_unroll +#define ccl_align(n) alignas(n) +#define ccl_optional_struct_init + +/* No assert supported for Metal */ + +#define kernel_assert(cond) + +#define ccl_gpu_global_id_x() metal_global_id +#define ccl_gpu_warp_size simdgroup_size +#define ccl_gpu_thread_idx_x simd_group_index +#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1) + +#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate))) +#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup); + +// clang-format off + +/* kernel.h adapters */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +/* Convert a comma-separated list into a semicolon-separated list + * (so that we can generate a struct based on kernel entry-point parameters). 
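*/

The FN0..FN20 block below is the classic argument-counting trick: PARAMS_MAKER pads the argument list so that GET_LAST_ARG always picks the FN macro whose arity matches the parameter count. A reduced, self-contained demonstration (three-argument version, hypothetical DEMO_* names):

#define DEMO_FN3(p1, p2, p3) p1; p2; p3;
#define DEMO_GET_LAST(p0, p1, p2, p3, ...) p3
#define DEMO_PARAMS_MAKER(...) DEMO_GET_LAST(__VA_ARGS__, DEMO_FN3, DEMO_FN2, DEMO_FN1, DEMO_FN0)

struct demo_params {
  DEMO_PARAMS_MAKER(int sx, int sy, int sw)(int sx, int sy, int sw)
  /* expands to the member declarations: int sx; int sy; int sw; */
};

This is how ccl_gpu_kernel_signature turns an entry-point parameter list into the fields of the generated kernel_gpu_<name> struct.

/*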
*/ +#define FN0() +#define FN1(p1) p1; +#define FN2(p1, p2) p1; p2; +#define FN3(p1, p2, p3) p1; p2; p3; +#define FN4(p1, p2, p3, p4) p1; p2; p3; p4; +#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5; +#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6; +#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7; +#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8; +#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9; +#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; +#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; +#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; +#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; +#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; +#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; +#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; +#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; +#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; +#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; +#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20; +#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20 +#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) + +/* Generate a struct containing the entry-point parameters and a "run" + * method which can access them implicitly via this-> */ +#define ccl_gpu_kernel_signature(name, ...) 
\ +struct kernel_gpu_##name \ +{ \ + PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \ + void run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const; \ +}; \ +kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \ + constant KernelParamsMetal &ccl_restrict _launch_params_metal, \ + constant MetalAncillaries *_metal_ancillaries, \ + threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \ + const uint metal_global_id [[thread_position_in_grid]], \ + const ushort metal_local_id [[thread_position_in_threadgroup]], \ + const ushort metal_local_size [[threads_per_threadgroup]], \ + uint simdgroup_size [[threads_per_simdgroup]], \ + uint simd_lane_index [[thread_index_in_simdgroup]], \ + uint simd_group_index [[simdgroup_index_in_threadgroup]], \ + uint num_simd_groups [[simdgroups_per_threadgroup]]) { \ + MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \ + params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \ +} \ +void kernel_gpu_##name::run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const + +#define ccl_gpu_kernel_call(x) context.x + +/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) 
\ + struct KernelLambda \ + { \ + KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \ + ccl_private MetalKernelContext &context; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass(context) + +// clang-format on + +/* volumetric lambda functions - use function objects for lambda-like functionality */ +#define VOLUME_READ_LAMBDA(function_call) \ + struct FnObjectRead { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + VolumeStack operator()(const int i) const \ + { \ + return context->function_call; \ + } \ + } volume_read_lambda_pass{kg, this, state}; + +#define VOLUME_WRITE_LAMBDA(function_call) \ + struct FnObjectWrite { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + void operator()(const int i, VolumeStack entry) const \ + { \ + context->function_call; \ + } \ + } volume_write_lambda_pass{kg, this, state}; + +/* make_type definitions with Metal style element initializers */ +#ifdef make_float2 +# undef make_float2 +#endif +#ifdef make_float3 +# undef make_float3 +#endif +#ifdef make_float4 +# undef make_float4 +#endif +#ifdef make_int2 +# undef make_int2 +#endif +#ifdef make_int3 +# undef make_int3 +#endif +#ifdef make_int4 +# undef make_int4 +#endif +#ifdef make_uchar4 +# undef make_uchar4 +#endif + +#define make_float2(x, y) float2(x, y) +#define make_float3(x, y, z) float3(x, y, z) +#define make_float4(x, y, z, w) float4(x, y, z, w) +#define make_int2(x, y) int2(x, y) +#define make_int3(x, y, z) int3(x, y, z) +#define make_int4(x, y, z, w) int4(x, y, z, w) +#define make_uchar4(x, y, z, w) uchar4(x, y, z, w) + +/* Math functions */ + +#define __uint_as_float(x) as_type<float>(x) +#define __float_as_uint(x) as_type<uint>(x) +#define __int_as_float(x) as_type<float>(x) +#define __float_as_int(x) as_type<int>(x) +#define __float2half(x) half(x) +#define powf(x, y) pow(float(x), float(y)) +#define fabsf(x) fabs(float(x)) +#define copysignf(x, y) copysign(float(x), float(y)) +#define asinf(x) asin(float(x)) +#define acosf(x) acos(float(x)) +#define atanf(x) atan(float(x)) +#define floorf(x) floor(float(x)) +#define ceilf(x) ceil(float(x)) +#define hypotf(x, y) hypot(float(x), float(y)) +#define atan2f(x, y) atan2(float(x), float(y)) +#define fmaxf(x, y) fmax(float(x), float(y)) +#define fminf(x, y) fmin(float(x), float(y)) +#define fmodf(x, y) fmod(float(x), float(y)) +#define sinhf(x) sinh(float(x)) +#define coshf(x) cosh(float(x)) +#define tanhf(x) tanh(float(x)) +#define saturatef(x) saturate(float(x)) + +/* Use native functions with possibly lower precision for performance, + * no issues found so far. 
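*/

A note on the VOLUME_READ_LAMBDA / VOLUME_WRITE_LAMBDA definitions above: like ccl_gpu_kernel_lambda, the Metal port avoids plain C++ lambdas here and substitutes a struct with explicitly captured members and an operator(). The shape of the pattern, reduced to plain C++ with hypothetical names:

struct StackRead {
  const int *stack; /* explicit member instead of a lambda capture list */
  int operator()(const int i) const
  {
    return stack[i];
  }
};

template<typename ReadOp> int stack_sum(ReadOp read, const int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++) {
    sum += read(i); /* generic code invokes the functor exactly like a lambda */
  }
  return sum;
}

/*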
*/ +#define trigmode fast +#define sinf(x) trigmode::sin(float(x)) +#define cosf(x) trigmode::cos(float(x)) +#define tanf(x) trigmode::tan(float(x)) +#define expf(x) trigmode::exp(float(x)) +#define sqrtf(x) trigmode::sqrt(float(x)) +#define logf(x) trigmode::log(float(x)) + +#define NULL 0 + +#define __device__ + +#ifdef __METALRT__ + +# define __KERNEL_GPU_RAYTRACING__ + +# if defined(__METALRT_MOTION__) +# define METALRT_TAGS instancing, instance_motion, primitive_motion +# else +# define METALRT_TAGS instancing +# endif /* __METALRT_MOTION__ */ + +typedef acceleration_structure<METALRT_TAGS> metalrt_as_type; +typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type; +typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type; + +#endif /* __METALRT__ */ + +/* texture bindings and sampler setup */ + +struct Texture2DParamsMetal { + texture2d<float, access::sample> tex; +}; +struct Texture3DParamsMetal { + texture3d<float, access::sample> tex; +}; + +struct MetalAncillaries { + device Texture2DParamsMetal *textures_2d; + device Texture3DParamsMetal *textures_3d; + +#ifdef __METALRT__ + metalrt_as_type accel_struct; + metalrt_ift_type ift_default; + metalrt_ift_type ift_shadow; + metalrt_ift_type ift_local; +#endif +}; + +#include "util/half.h" +#include "util/types.h" + +enum SamplerType { + SamplerFilterNearest_AddressRepeat, + SamplerFilterNearest_AddressClampEdge, + SamplerFilterNearest_AddressClampZero, + + SamplerFilterLinear_AddressRepeat, + SamplerFilterLinear_AddressClampEdge, + SamplerFilterLinear_AddressClampZero, + + SamplerCount +}; + +constant constexpr array<sampler, SamplerCount> metal_samplers = { + sampler(address::repeat, filter::nearest), + sampler(address::clamp_to_edge, filter::nearest), + sampler(address::clamp_to_zero, filter::nearest), + sampler(address::repeat, filter::linear), + sampler(address::clamp_to_edge, filter::linear), + sampler(address::clamp_to_zero, filter::linear), +}; diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h new file mode 100644 index 00000000000..2eefd795aa1 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_begin.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// clang-format off + +/* Open the Metal kernel context class + * Necessary to access resource bindings */ +class MetalKernelContext { + public: + constant KernelParamsMetal &launch_params_metal; + constant MetalAncillaries *metal_ancillaries; + + MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries) + : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries) + {} + + MetalKernelContext(constant KernelParamsMetal &_launch_params_metal) + : launch_params_metal(_launch_params_metal) + {} + + /* texture fetch adapter functions */ + typedef uint64_t ccl_gpu_tex_object; + + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + kernel_assert(0); + return 0; + } + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + kernel_assert(0); + return 0; + } + + // texture2d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x; + } + + // texture3d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x; + } +# include "kernel/device/gpu/image.h" + + // clang-format on diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h new file mode 100644 index 00000000000..e700f294440 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_end.h @@ -0,0 +1,23 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +} +; /* end of MetalKernelContext class definition */ + +/* Silently redirect into the MetalKernelContext instance */ +/* NOTE: These macros will need maintaining as entry-points change. 
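*/

One detail of the texture adapters above: on Metal, ccl_gpu_tex_object is a bare uint64_t whose low 32 bits (tid) index the textures_2d/textures_3d arrays and whose high 32 bits (sid) select an entry in the metal_samplers table. A host-side sketch of the matching encode step, with a hypothetical helper name:

#include <cstdint>

static uint64_t make_metal_tex_handle(const uint32_t texture_index,
                                      const uint32_t sampler_index)
{
  /* Mirrors the decode `const uint tid(tex); const uint sid(tex >> 32);`. */
  return uint64_t(texture_index) | (uint64_t(sampler_index) << 32);
}

/*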
*/ + +#undef kernel_integrator_state +#define kernel_integrator_state context.launch_params_metal.__integrator_state diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h new file mode 100644 index 00000000000..1aea36589d0 --- /dev/null +++ b/intern/cycles/kernel/device/metal/globals.h @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +#include "kernel/integrator/state.h" + +CCL_NAMESPACE_BEGIN + +typedef struct KernelParamsMetal { + +#define KERNEL_TEX(type, name) ccl_global const type *name; +#include "kernel/textures.h" +#undef KERNEL_TEX + + const IntegratorStateGPU __integrator_state; + const KernelData data; + +} KernelParamsMetal; + +typedef struct KernelGlobalsGPU { + int unused[1]; +} KernelGlobalsGPU; + +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data launch_params_metal.data +#define kernel_integrator_state launch_params_metal.__integrator_state + +/* data lookup defines */ + +#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index] +#define kernel_tex_array(tex) launch_params_metal.tex + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal new file mode 100644 index 00000000000..6b77940660f --- /dev/null +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -0,0 +1,765 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Metal kernel entry points */ + +#include "kernel/device/metal/compat.h" +#include "kernel/device/metal/globals.h" +#include "kernel/device/gpu/kernel.h" + +/* MetalRT intersection handlers */ +#ifdef __METALRT__ + +/* Return type for a bounding box intersection function. */ +struct BoundingBoxIntersectionResult +{ + bool accept [[accept_intersection]]; + bool continue_search [[continue_search]]; + float distance [[distance]]; +}; + +/* Return type for a triangle intersection function. 
*/
+struct TriangleIntersectionResult
+{
+  bool accept [[accept_intersection]];
+  bool continue_search [[continue_search]];
+};
+
+enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };
+
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+                                              const int object,
+                                              const int prim)
+{
+  return (self.prim == prim) && (self.object == object);
+}
+
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+                                                     const int object,
+                                                     const int prim)
+{
+  return ((self.prim == prim) && (self.object == object)) ||
+         ((self.light_prim == prim) && (self.light_object == object));
+}
+
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+                                                    const int prim)
+{
+  return (self.prim == prim);
+}
+
+template<typename TReturn, uint intersection_type>
+TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
+                          ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
+                          const uint object,
+                          const uint primitive_id,
+                          const float2 barycentrics,
+                          const float ray_tmax)
+{
+  TReturn result;
+
+#ifdef __BVH_LOCAL__
+  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+
+  if ((object != payload.local_object) || intersection_skip_self_local(payload.self, prim)) {
+    /* Only intersect with matching object and skip self-intersection. */
+    result.accept = false;
+    result.continue_search = true;
+    return result;
+  }
+
+  const short max_hits = payload.max_hits;
+  if (max_hits == 0) {
+    /* Special case for when no hit information is requested; just report that something was hit. */
+    payload.result = true;
+    result.accept = true;
+    result.continue_search = false;
+    return result;
+  }
+
+  int hit = 0;
+  if (payload.has_lcg_state) {
+    for (short i = min(max_hits, short(payload.local_isect.num_hits)) - 1; i >= 0; --i) {
+      if (ray_tmax == payload.local_isect.hits[i].t) {
+        result.accept = false;
+        result.continue_search = true;
+        return result;
+      }
+    }
+
+    hit = payload.local_isect.num_hits++;
+
+    if (payload.local_isect.num_hits > max_hits) {
+      hit = lcg_step_uint(&payload.lcg_state) % payload.local_isect.num_hits;
+      if (hit >= max_hits) {
+        result.accept = false;
+        result.continue_search = true;
+        return result;
+      }
+    }
+  }
+  else {
+    if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) {
+      /* Record closest intersection only. 
Do not terminate ray here, since there is no guarantee about distance ordering in any-hit */ + result.accept = false; + result.continue_search = true; + return result; + } + + payload.local_isect.num_hits = 1; + } + + ray_data Intersection *isect = &payload.local_isect.hits[hit]; + isect->t = ray_tmax; + isect->prim = prim; + isect->object = object; + isect->type = kernel_tex_fetch(__objects, object).primitive_type; + + isect->u = 1.0f - barycentrics.y - barycentrics.x; + isect->v = barycentrics.x; + + /* Record geometric normal */ + const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect->prim).w; + const float3 tri_a = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)); + const float3 tri_b = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)); + const float3 tri_c = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2)); + payload.local_isect.Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); + + /* Continue tracing (without this the trace call would return after the first hit) */ + result.accept = false; + result.continue_search = true; + return result; +#endif +} + +[[intersection(triangle, triangle_data, METALRT_TAGS)]] +TriangleIntersectionResult +__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]], + uint instance_id [[user_instance_id]], + uint primitive_id [[primitive_id]], + float2 barycentrics [[barycentric_coord]], + float ray_tmax [[distance]]) +{ + return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax); +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]]) +{ + /* unused function */ + BoundingBoxIntersectionResult result; + result.distance = ray_tmax; + result.accept = false; + result.continue_search = false; + return result; +} + +template<uint intersection_type> +bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + uint object, + uint prim, + const float2 barycentrics, + const float ray_tmax) +{ +#ifdef __SHADOW_RECORD_ALL__ +# ifdef __VISIBILITY_FLAG__ + const uint visibility = payload.visibility; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + /* continue search */ + return true; + } +# endif + + if (intersection_skip_self_shadow(payload.self, object, prim)) { + /* continue search */ + return true; + } + + float u = 0.0f, v = 0.0f; + int type = 0; + if (intersection_type == METALRT_HIT_TRIANGLE) { + u = 1.0f - barycentrics.y - barycentrics.x; + v = barycentrics.x; + type = kernel_tex_fetch(__objects, object).primitive_type; + } +# ifdef __HAIR__ + else { + u = barycentrics.x; + v = barycentrics.y; + + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + type = segment.type; + prim = segment.prim; + + /* Filter out curve endcaps */ + if (u == 0.0f || u == 1.0f) { + /* continue search */ + return true; + } + } +# endif + +# ifndef __TRANSPARENT_SHADOWS__ + /* No transparent shadows support compiled in, make opaque. 
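*/

The visibility checks sprinkled through these intersection handlers are all the same one-line mask test: a ray sees an object only when the ray's visibility bits and the object's visibility flags overlap. As a tiny predicate (hypothetical helper):

static inline bool object_visible_to_ray(const unsigned int object_flags,
                                         const unsigned int ray_visibility)
{
  return (object_flags & ray_visibility) != 0;
}

/*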
*/ + payload.result = true; + /* terminate ray */ + return false; +# else + short max_hits = payload.max_hits; + short num_hits = payload.num_hits; + short num_recorded_hits = payload.num_recorded_hits; + + MetalKernelContext context(launch_params_metal); + + /* If no transparent shadows, all light is blocked and we can stop immediately. */ + if (num_hits >= max_hits || + !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + payload.result = true; + /* terminate ray */ + return false; + } + + /* Always use baked shadow transparency for curves. */ + if (type & PRIMITIVE_CURVE) { + float throughput = payload.throughput; + throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u); + payload.throughput = throughput; + payload.num_hits += 1; + + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + /* Accept result and terminate if throughput is sufficiently low */ + payload.result = true; + return false; + } + else { + return true; + } + } + + payload.num_hits += 1; + payload.num_recorded_hits += 1; + + uint record_index = num_recorded_hits; + + const IntegratorShadowState state = payload.state; + + const uint max_record_hits = min(uint(max_hits), INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { + /* If maximum number of hits reached, find a hit to replace. */ + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); + uint max_recorded_hit = 0; + + for (int i = 1; i < max_record_hits; i++) { + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; + max_recorded_hit = i; + } + } + + if (ray_tmax >= max_recorded_t) { + /* Accept hit, so that we don't consider any more hits beyond the distance of the + * current hit anymore. */ + payload.result = true; + return true; + } + + record_index = max_recorded_hit; + } + + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = ray_tmax; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; + + /* Continue tracing. 
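*/

The replacement logic above keeps shadow-hit recording bounded: once the hit array is full, the farthest recorded hit is either overwritten by a nearer incoming hit, or, if the incoming hit lies beyond everything recorded, the handler accepts it so the ray gets clamped there. The slot choice, reduced to plain C++ (hypothetical helper):

/* Returns the slot to overwrite, or -1 when the new hit lies beyond every
 * recorded hit and should clamp the ray instead. */
static int shadow_hit_slot(const float new_t, const float *recorded_t, const int num_recorded)
{
  int farthest = 0;
  for (int i = 1; i < num_recorded; i++) {
    if (recorded_t[i] > recorded_t[farthest]) {
      farthest = i;
    }
  }
  return (new_t >= recorded_t[farthest]) ? -1 : farthest;
}

/*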
+# endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __SHADOW_RECORD_ALL__ */ + + return true; +} + +[[intersection(triangle, triangle_data, METALRT_TAGS)]] +TriangleIntersectionResult +__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + unsigned int object [[user_instance_id]], + unsigned int primitive_id [[primitive_id]], + float2 barycentrics [[barycentric_coord]], + float ray_tmax [[distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + + TriangleIntersectionResult result; + result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, object, prim, barycentrics, ray_tmax); + result.accept = !result.continue_search; + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]]) +{ + /* unused function */ + BoundingBoxIntersectionResult result; + result.distance = ray_tmax; + result.accept = false; + result.continue_search = false; + return result; +} + +template<typename TReturnType, uint intersection_type> +inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const float u) +{ + TReturnType result; + +# ifdef __HAIR__ + if (intersection_type == METALRT_HIT_BOUNDING_BOX) { + /* Filter out curve endcaps. */ + if (u == 0.0f || u == 1.0f) { + result.accept = false; + result.continue_search = true; + return result; + } + } +# endif + + uint visibility = payload.visibility; +# ifdef __VISIBILITY_FLAG__ + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + result.accept = false; + result.continue_search = true; + return result; + } +# endif + + /* Shadow ray early termination. */
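/* Editor's note (not part of the patch): the branch below reduces to a small
 * decision table, read directly from the code:
 *
 *   visibility flag          self-intersection?   accept   continue_search
 *   PATH_RAY_SHADOW_OPAQUE   yes                  false    true   (skip hit)
 *   PATH_RAY_SHADOW_OPAQUE   no                   true     false  (any hit blocks the ray)
 *   other                    yes                  false    true   (skip hit)
 *   other                    no                   true     true   (record hit, keep searching)
 */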
+ if (visibility & PATH_RAY_SHADOW_OPAQUE) { + if (intersection_skip_self_shadow(payload.self, object, prim)) { + result.accept = false; + result.continue_search = true; + return result; + } + else { + result.accept = true; + result.continue_search = false; + return result; + } + } + else { + if (intersection_skip_self(payload.self, object, prim)) { + result.accept = false; + result.continue_search = true; + return result; + } + } + + result.accept = true; + result.continue_search = true; + return result; +} + +[[intersection(triangle, triangle_data, METALRT_TAGS)]] +TriangleIntersectionResult +__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + unsigned int object [[user_instance_id]], + unsigned int primitive_id [[primitive_id]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, object, prim, 0.0f); + if (result.accept) { + payload.prim = prim; + payload.type = kernel_tex_fetch(__objects, object).primitive_type; + } + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]]) +{ + /* Unused function */ + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + return result; +} + +#ifdef __HAIR__ +ccl_device_inline +void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ +# ifdef __VISIBILITY_FLAG__ + const uint visibility = payload.visibility; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the curve intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space. */
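/* Editor's note (not part of the patch): a worked example of the scaling below.
 * normalize_len() returns the unit direction and writes the original length to len.
 * If the unnormalized object-space direction has len = 2 and ray_tmax = 10, the same
 * endpoint sits at t = 10 * 2 = 20 along the unit direction, so isect.t is multiplied
 * by len before intersecting and divided by len again when the hit distance is
 * reported back (result.distance = isect.t / len below). */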
+ if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, isect.u); + if (result.accept) { + result.distance = isect.t / len; + payload.u = isect.u; + payload.v = isect.v; + payload.prim = prim; + payload.type = type; + } + } +} + +ccl_device_inline +void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ + const uint visibility = payload.visibility; + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the curve intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space */ + if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); + result.accept = !result.continue_search; + + if (result.accept) { + result.distance = isect.t / len; + } + } +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + if (segment.type & PRIMITIVE_CURVE_RIBBON) { + metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + } + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + if
(segment.type & PRIMITIVE_CURVE_RIBBON) { + metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + } + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} +#endif /* __HAIR__ */ + +#ifdef __POINTCLOUD__ +ccl_device_inline +void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ +# ifdef __VISIBILITY_FLAG__ + const uint visibility = payload.visibility; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the point intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space. 
*/ + if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, isect.u); + if (result.accept) { + result.distance = isect.t / len; + payload.u = isect.u; + payload.v = isect.v; + payload.prim = prim; + payload.type = type; + } + } +} + +ccl_device_inline +void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ + const uint visibility = payload.visibility; + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the point intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space */ + if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); + result.accept = !result.continue_search; + + if (result.accept) { + result.distance = isect.t / len; + } + } +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const int type = kernel_tex_fetch(__objects, object).primitive_type; + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const int type = kernel_tex_fetch(__objects, object).primitive_type; + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, 
ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} +#endif /* __POINTCLOUD__ */ +#endif /* __METALRT__ */ diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index fb9e094b535..db4233624b9 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -21,6 +21,7 @@ #include <optix.h> #define __KERNEL_GPU__ +#define __KERNEL_GPU_RAYTRACING__ #define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */ #define __KERNEL_OPTIX__ #define CCL_NAMESPACE_BEGIN @@ -49,16 +50,16 @@ typedef unsigned long long uint64_t; __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything #define ccl_device_inline ccl_device #define ccl_device_forceinline ccl_device +#define ccl_device_inline_method ccl_device #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) @@ -77,6 +78,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -86,7 +88,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ @@ -121,7 +122,14 @@ __device__ half __float2half(const float f) return val; } +__device__ float __half2float(const half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); + return val; +} + /* Types */ -#include "util/util_half.h" -#include "util/util_types.h" +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h index 7d898ed5d91..e9b72369cd5 100644 --- a/intern/cycles/kernel/device/optix/globals.h +++ b/intern/cycles/kernel/device/optix/globals.h @@ -18,18 +18,20 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" -#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */
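/* Editor's note (not part of the patch): KERNEL_TEX below is an X-macro. Each file
 * defines KERNEL_TEX(type, name) to suit its needs and then includes
 * kernel/textures.h, which re-expands to one entry per scene data array. For a
 * hypothetical entry KERNEL_TEX(uint, __object_flag), the definition below would
 * produce "const uint *__object_flag;" inside KernelParamsOptiX, which is the
 * storage that kernel_tex_fetch() reads on the OptiX device. */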
-struct KernelGlobals { +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Launch parameters */ struct KernelParamsOptiX { @@ -40,7 +42,7 @@ struct KernelParamsOptiX { /* Global scene data and textures */ KernelData data; #define KERNEL_TEX(type, name) const type *name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" /* Integrator state */ IntegratorStateGPU __integrator_state; diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 7a79e0c4823..8e3d57bff8a 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -21,42 +21,44 @@ #include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */ -#include "kernel/integrator/integrator_state.h" -#include "kernel/integrator/integrator_state_flow.h" -#include "kernel/integrator/integrator_state_util.h" +#include "kernel/tables.h" -#include "kernel/integrator/integrator_intersect_closest.h" -#include "kernel/integrator/integrator_intersect_shadow.h" -#include "kernel/integrator/integrator_intersect_subsurface.h" -#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/state.h" +#include "kernel/integrator/state_flow.h" +#include "kernel/integrator/state_util.h" +#include "kernel/integrator/intersect_closest.h" +#include "kernel/integrator/intersect_shadow.h" +#include "kernel/integrator/intersect_subsurface.h" +#include "kernel/integrator/intersect_volume_stack.h" // clang-format on +#define OPTIX_DEFINE_ABI_VERSION_ONLY +#include <optix_function_table.h> + template<typename T> ccl_device_forceinline T *get_payload_ptr_0() { - return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0()); + return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1()); } template<typename T> ccl_device_forceinline T *get_payload_ptr_2() { - return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2()); + return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3()); +} + +template<typename T> ccl_device_forceinline T *get_payload_ptr_6() +{ + return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6()); } -template<bool always = false> ccl_device_forceinline uint get_object_id() +ccl_device_forceinline int get_object_id() { #ifdef __OBJECT_MOTION__ - /* Always get the the instance ID from the TLAS. + /* Always get the instance ID from the TLAS * There might be a motion transform node between TLAS and BLAS which does not have one. */ - uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); + return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); #else - uint object = optixGetInstanceId(); + return optixGetInstanceId(); #endif - /* Choose between always returning object ID or only for instances. */ - if (always || (object & 1) == 0) - /* Can just remove the low bit since instance always contains object ID. */ - return object >> 1; - else - /* Set to OBJECT_NONE if this is not an instanced object. */ - return OBJECT_NONE; } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() { const int global_index = optixGetLaunchIndex().x; const int path_index = (__params.path_index_array) ?
__params.path_index_array[global_index] : global_index; - integrator_intersect_closest(nullptr, path_index); + integrator_intersect_closest(nullptr, path_index, __params.render_buffer); } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow() @@ -100,20 +102,26 @@ extern "C" __global__ void __miss__kernel_optix_miss() extern "C" __global__ void __anyhit__kernel_optix_local_hit() { -#ifdef __HAIR__ +#if defined(__HAIR__) || defined(__POINTCLOUD__) if (!optixIsTriangleHit()) { - /* Ignore curves. */ + /* Ignore curves and points. */ return optixIgnoreIntersection(); } #endif #ifdef __BVH_LOCAL__ - const uint object = get_object_id<true>(); + const int object = get_object_id(); if (object != optixGetPayload_4() /* local_object */) { /* Only intersect with matching object. */ return optixIgnoreIntersection(); } + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self_local(ray->self, prim)) { + return optixIgnoreIntersection(); + } + const uint max_hits = optixGetPayload_5(); if (max_hits == 0) { /* Special case for when no hit information is requested, just report that something was hit */ @@ -154,19 +162,19 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() Intersection *isect = &local_isect->hits[hit]; isect->t = optixGetRayTmax(); - isect->prim = optixGetPrimitiveIndex(); + isect->prim = prim; isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, isect->prim); + isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type; const float2 barycentrics = optixGetTriangleBarycentrics(); isect->u = 1.0f - barycentrics.y - barycentrics.x; isect->v = barycentrics.x; /* Record geometric normal. */ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); - const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0)); - const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1)); - const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2)); + const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w; + const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0); + const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1); + const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); /* Continue tracing (without this the trace call would return after the first hit). */
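/* Editor's sketch (not part of the patch): OptiX payload registers are 32 bits wide,
 * so the get_payload_ptr_* helpers above split a 64-bit pointer across two registers,
 * and the shadow any-hit program below packs two 16-bit hit counters into a single
 * register. A minimal, self-consistent sketch of both tricks; the unpack direction
 * mirrors get_payload_ptr_6() above, while the exact bit layout of the 16-bit helpers
 * is an assumption. */
template<typename T> ccl_device_forceinline T *pointer_unpack_from_uint(const uint a, const uint b)
{
  /* Low half in a, high half in b, matching get_payload_ptr_6() above. */
  return (T *)(((uint64_t)b << 32) | a);
}

ccl_device_forceinline uint uint16_pack_to_uint(const uint a, const uint b)
{
  /* Assumed layout: a in the low 16 bits, b in the high 16 bits. */
  return (a & 0xFFFF) | (b << 16);
}

ccl_device_forceinline uint uint16_unpack_from_uint_0(const uint i)
{
  return i & 0xFFFF; /* Recovers a. */
}

ccl_device_forceinline uint uint16_unpack_from_uint_1(const uint i)
{
  return i >> 16; /* Recovers b. */
}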
@@ -177,167 +185,239 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() { #ifdef __SHADOW_RECORD_ALL__ - bool ignore_intersection = false; - - const uint prim = optixGetPrimitiveIndex(); + int prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); # ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { - ignore_intersection = true; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return optixIgnoreIntersection(); } # endif + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self_shadow(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + float u = 0.0f, v = 0.0f; + int type = 0; if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); u = 1.0f - barycentrics.y - barycentrics.x; v = barycentrics.x; + type = kernel_tex_fetch(__objects, object).primitive_type; } # ifdef __HAIR__ - else { + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { u = __uint_as_float(optixGetAttribute_0()); v = __uint_as_float(optixGetAttribute_1()); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + type = segment.type; + prim = segment.prim; + +# if OPTIX_ABI_VERSION < 55 /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { - ignore_intersection = true; + return optixIgnoreIntersection(); } +# endif } # endif + else { + type = kernel_tex_fetch(__objects, object).primitive_type; + u = 0.0f; + v = 0.0f; + } - int num_hits = optixGetPayload_2(); - int record_index = num_hits; - const int max_hits = optixGetPayload_3(); +# ifndef __TRANSPARENT_SHADOWS__ + /* No transparent shadows support compiled in, make opaque. */ + optixSetPayload_5(true); + return optixTerminateRay(); +# else + const uint max_hits = optixGetPayload_3(); + const uint num_hits_packed = optixGetPayload_2(); + const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed); + const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed); + + /* If no transparent shadows, all light is blocked and we can stop immediately. */ + if (num_hits >= max_hits || + !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + optixSetPayload_5(true); + return optixTerminateRay(); + } + + /* Always use baked shadow transparency for curves. */ + if (type & PRIMITIVE_CURVE) { + float throughput = __uint_as_float(optixGetPayload_1()); + throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u); + optixSetPayload_1(__float_as_uint(throughput)); + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1)); - if (!ignore_intersection) { - optixSetPayload_2(num_hits + 1); + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + optixSetPayload_5(true); + return optixTerminateRay(); + } + else { + /* Continue tracing. */ + optixIgnoreIntersection(); + return; + } } - Intersection *const isect_array = get_payload_ptr_0<Intersection>(); + /* Record transparent intersection.
*/ + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1)); + + uint record_index = num_recorded_hits; + + const IntegratorShadowState state = optixGetPayload_0(); -# ifdef __TRANSPARENT_SHADOWS__ - if (num_hits >= max_hits) { + const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { /* If maximum number of hits reached, find a hit to replace. */ - const int num_recorded_hits = min(max_hits, num_hits); - float max_recorded_t = isect_array[0].t; - int max_recorded_hit = 0; + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); + uint max_recorded_hit = 0; - for (int i = 1; i < num_recorded_hits; i++) { - if (isect_array[i].t > max_recorded_t) { - max_recorded_t = isect_array[i].t; + for (int i = 1; i < max_record_hits; i++) { + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; max_recorded_hit = i; } } if (optixGetRayTmax() >= max_recorded_t) { - /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current - * hit anymore. */ + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the + * current hit. */ return; } record_index = max_recorded_hit; } -# endif - if (!ignore_intersection) { - Intersection *const isect = isect_array + record_index; - isect->u = u; - isect->v = v; - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, prim); - -# ifdef __TRANSPARENT_SHADOWS__ - /* Detect if this surface has a shader with transparent shadows. */ - if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { -# endif - /* If no transparent shadows, all light is blocked and we can stop immediately. */ - optixSetPayload_5(true); - return optixTerminateRay(); -# ifdef __TRANSPARENT_SHADOWS__ - } -# endif - } + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax(); + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; /* Continue tracing. */ optixIgnoreIntersection(); -#endif +# endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __SHADOW_RECORD_ALL__ */ } extern "C" __global__ void __anyhit__kernel_optix_volume_test() { -#ifdef __HAIR__ +#if defined(__HAIR__) || defined(__POINTCLOUD__) if (!optixIsTriangleHit()) { /* Ignore curves and points.
*/ return optixIgnoreIntersection(); } #endif + const uint object = get_object_id(); #ifdef __VISIBILITY_FLAG__ - const uint prim = optixGetPrimitiveIndex(); const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { return optixIgnoreIntersection(); } #endif - const uint object = get_object_id<true>(); if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { return optixIgnoreIntersection(); } + + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } } extern "C" __global__ void __anyhit__kernel_optix_visibility_test() { #ifdef __HAIR__ - if (!optixIsTriangleHit()) { +# if OPTIX_ABI_VERSION < 55 + if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) { /* Filter out curve endcaps. */ const float u = __uint_as_float(optixGetAttribute_0()); if (u == 0.0f || u == 1.0f) { return optixIgnoreIntersection(); } } +# endif #endif -#ifdef __VISIBILITY_FLAG__ - const uint prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { +#ifdef __VISIBILITY_FLAG__ + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { return optixIgnoreIntersection(); } +#endif + + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); - /* Shadow ray early termination. */ if (visibility & PATH_RAY_SHADOW_OPAQUE) { - return optixTerminateRay(); + if (intersection_skip_self_shadow(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + else { + /* Shadow ray early termination. */ + return optixTerminateRay(); + } + } + else { + if (intersection_skip_self(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } } -#endif } extern "C" __global__ void __closesthit__kernel_optix_hit() { + const int object = get_object_id(); + const int prim = optixGetPrimitiveIndex(); + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ - optixSetPayload_3(optixGetPrimitiveIndex()); - optixSetPayload_4(get_object_id()); - /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. 
*/ - optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex())); + optixSetPayload_4(object); if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x)); optixSetPayload_2(__float_as_uint(barycentrics.x)); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type); } - else { + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ optixSetPayload_2(optixGetAttribute_1()); + optixSetPayload_3(segment.prim); + optixSetPayload_5(segment.type); + } + else { + optixSetPayload_1(0); + optixSetPayload_2(0); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type); } } #ifdef __HAIR__ -ccl_device_inline void optix_intersection_curve(const uint prim, const uint type) +ccl_device_inline void optix_intersection_curve(const int prim, const int type) { - const uint object = get_object_id<true>(); + const int object = get_object_id(); + +# ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); @@ -358,7 +438,8 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type if (isect.t != FLT_MAX) isect.t *= len; - if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, __float_as_int(isect.u), /* Attribute_0 */ @@ -368,11 +449,53 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type extern "C" __global__ void __intersection__curve_ribbon() { - const uint prim = optixGetPrimitiveIndex(); - const uint type = kernel_tex_fetch(__prim_type, prim); - - if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex()); + const int prim = segment.prim; + const int type = segment.type; + if (type & PRIMITIVE_CURVE_RIBBON) { optix_intersection_curve(prim, type); } } + +#endif + +#ifdef __POINTCLOUD__ +extern "C" __global__ void __intersection__point() +{ + const int prim = optixGetPrimitiveIndex(); + const int object = get_object_id(); + const int type = kernel_tex_fetch(__objects, object).primitive_type; + +# ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif + + float3 P = optixGetObjectRayOrigin(); + float3 dir = optixGetObjectRayDirection(); + + /* The direction is not normalized by default, the point intersection routine expects that. */ + float len; + dir = normalize_len(dir, &len); + +# ifdef __OBJECT_MOTION__ + const float time = optixGetRayTime(); +# else + const float time = 0.0f; +# endif + + Intersection isect; + isect.t = optixGetRayTmax(); + /* Transform maximum distance into object space. 
*/ + if (isect.t != FLT_MAX) { + isect.t *= len; + } + + if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); + optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL); + } +} #endif diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu index bf787e29eaa..071e9deae0b 100644 --- a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu +++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu @@ -18,7 +18,8 @@ * much longer to compile. This is only loaded when needed by the scene. */ #include "kernel/device/optix/kernel.cu" -#include "kernel/integrator/integrator_shade_surface.h" + +#include "kernel/integrator/shade_surface.h" extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace() {