git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/device')
-rw-r--r--  intern/cycles/kernel/device/cpu/compat.h | 12
-rw-r--r--  intern/cycles/kernel/device/cpu/globals.h | 13
-rw-r--r--  intern/cycles/kernel/device/cpu/image.h | 8
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp | 6
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.h | 19
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch.h | 68
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 202
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx.cpp | 2
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx2.cpp | 2
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse2.cpp | 2
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse3.cpp | 2
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse41.cpp | 2
-rw-r--r--  intern/cycles/kernel/device/cuda/compat.h | 17
-rw-r--r--  intern/cycles/kernel/device/cuda/config.h | 19
-rw-r--r--  intern/cycles/kernel/device/cuda/globals.h | 12
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h | 16
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h | 946
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 116
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h | 14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h | 26
-rw-r--r--  intern/cycles/kernel/device/gpu/work_stealing.h | 55
-rw-r--r--  intern/cycles/kernel/device/hip/compat.h | 10
-rw-r--r--  intern/cycles/kernel/device/hip/config.h | 19
-rw-r--r--  intern/cycles/kernel/device/hip/globals.h | 13
-rw-r--r--  intern/cycles/kernel/device/metal/compat.h | 316
-rw-r--r--  intern/cycles/kernel/device/metal/context_begin.h | 83
-rw-r--r--  intern/cycles/kernel/device/metal/context_end.h | 23
-rw-r--r--  intern/cycles/kernel/device/metal/globals.h | 51
-rw-r--r--  intern/cycles/kernel/device/metal/kernel.metal | 765
-rw-r--r--  intern/cycles/kernel/device/optix/compat.h | 18
-rw-r--r--  intern/cycles/kernel/device/optix/globals.h | 12
-rw-r--r--  intern/cycles/kernel/device/optix/kernel.cu | 319
-rw-r--r--  intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu | 3
33 files changed, 2480 insertions, 711 deletions
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
index bfd936c7bbd..5ccca52255f 100644
--- a/intern/cycles/kernel/device/cpu/compat.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -26,13 +26,11 @@
# pragma GCC diagnostic ignored "-Wuninitialized"
#endif
-#include "util/util_half.h"
-#include "util/util_math.h"
-#include "util/util_simd.h"
-#include "util/util_texture.h"
-#include "util/util_types.h"
-
-#define ccl_addr_space
+#include "util/half.h"
+#include "util/math.h"
+#include "util/simd.h"
+#include "util/texture.h"
+#include "util/types.h"
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
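
The fix that comment refers to is conventionally a guard that reroutes expf through the double-precision exp on affected glibc versions. A minimal sketch of such a guard (the exact condition used by Cycles is not part of this diff and is an assumption here):

/* Sketch only: route expf through exp() on pre-2.16 glibc, where the
 * single-precision version was much slower (see comment above). */
#if defined(__x86_64__) && defined(__GLIBC__) && \
    (__GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 16))
#  define expf(x) ((float)exp((double)(x)))
#endif
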
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
index 98b036e269d..746e48b9880 100644
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -18,8 +18,9 @@
#pragma once
-#include "kernel/kernel_profiling.h"
-#include "kernel/kernel_types.h"
+#include "kernel/tables.h"
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
CCL_NAMESPACE_BEGIN
@@ -34,9 +35,9 @@ struct OSLThreadData;
struct OSLShadingSystem;
#endif
-typedef struct KernelGlobals {
+typedef struct KernelGlobalsCPU {
#define KERNEL_TEX(type, name) texture<type> name;
-#include "kernel/kernel_textures.h"
+#include "kernel/textures.h"
KernelData __data;
@@ -51,7 +52,9 @@ typedef struct KernelGlobals {
/* **** Run-time data **** */
ProfilingState profiler;
-} KernelGlobals;
+} KernelGlobalsCPU;
+
+typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals;
/* Abstraction macros */
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
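
The typedef added above is the crux of this refactor: KernelGlobals is no longer a struct name but the device-specific pointer type itself (here const KernelGlobalsCPU *ccl_restrict; the cuda/globals.h hunk below defines a matching GPU variant). Kernel signatures can then drop the explicit pointer, as the image.h hunk that follows shows:

/* Before: each signature spells out the concrete pointer type. */
ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y);

/* After: the typedef carries the pointer-ness, so one signature
 * compiles unchanged for every device backend. */
ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y);
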
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
index 57e81ab186d..93f956e354d 100644
--- a/intern/cycles/kernel/device/cpu/image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -72,12 +72,12 @@ template<typename T> struct TextureInterpolator {
static ccl_always_inline float4 read(half4 r)
{
- return half4_to_float4(r);
+ return half4_to_float4_image(r);
}
static ccl_always_inline float4 read(half r)
{
- float f = half_to_float(r);
+ float f = half_to_float_image(r);
return make_float4(f, f, f, 1.0f);
}
@@ -583,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator {
#undef SET_CUBIC_SPLINE_WEIGHTS
-ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -611,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float
}
}
-ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
int id,
float3 P,
InterpolationType interp)
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index ac1cdf5fffe..a16c637d5ac 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -64,7 +64,7 @@ CCL_NAMESPACE_BEGIN
/* Memory Copy */
-void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
+void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t)
{
if (strcmp(name, "__data") == 0) {
kg->__data = *(KernelData *)host;
@@ -74,7 +74,7 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
}
}
-void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
+void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size)
{
if (0) {
}
@@ -85,7 +85,7 @@ void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, s
kg->tname.data = (type *)mem; \
kg->tname.width = size; \
}
-#include "kernel/kernel_textures.h"
+#include "kernel/textures.h"
else {
assert(0);
}
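
The bare #include "kernel/textures.h" in the middle of that else-chain is an X-macro: the header is nothing but a list of KERNEL_TEX(type, name) entries, and each includer defines KERNEL_TEX to stamp out whatever it needs. A reduced sketch of the pattern (the two entries are illustrative, not the real list):

/* textures.h, schematically: */
KERNEL_TEX(float4, __tri_verts)
KERNEL_TEX(uint, __object_flag)
#undef KERNEL_TEX

/* globals.h defines KERNEL_TEX to declare one member per entry: */
#define KERNEL_TEX(type, tname) texture<type> tname;
#include "kernel/textures.h"

/* kernel.cpp defines it to generate one string-compare branch per entry: */
#define KERNEL_TEX(type, tname) \
  else if (strcmp(name, #tname) == 0) { \
    kg->tname.data = (type *)mem; \
    kg->tname.width = size; \
  }
#include "kernel/textures.h"
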
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index ae2a841835a..6af8094b1ea 100644
--- a/intern/cycles/kernel/device/cpu/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -18,9 +18,10 @@
/* CPU Kernel Interface */
-#include "util/util_types.h"
+#include "util/half.h"
+#include "util/types.h"
-#include "kernel/kernel_types.h"
+#include "kernel/types.h"
CCL_NAMESPACE_BEGIN
@@ -29,17 +30,17 @@ CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
struct IntegratorStateCPU;
-struct KernelGlobals;
+struct KernelGlobalsCPU;
struct KernelData;
-KernelGlobals *kernel_globals_create();
-void kernel_globals_free(KernelGlobals *kg);
+KernelGlobalsCPU *kernel_globals_create();
+void kernel_globals_free(KernelGlobalsCPU *kg);
-void *kernel_osl_memory(const KernelGlobals *kg);
-bool kernel_osl_use(const KernelGlobals *kg);
+void *kernel_osl_memory(const KernelGlobalsCPU *kg);
+bool kernel_osl_use(const KernelGlobalsCPU *kg);
-void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
-void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
+void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t size);
+void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size);
#define KERNEL_ARCH cpu
#include "kernel/device/cpu/kernel_arch.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
index 81f328c710b..61f62f3136b 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -21,23 +21,23 @@
*/
#define KERNEL_INTEGRATOR_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
IntegratorStateCPU *state)
#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
IntegratorStateCPU *state, \
ccl_global float *render_buffer)
#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
- bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
IntegratorStateCPU *state, \
KernelWorkTile *tile, \
ccl_global float *render_buffer)
KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
-KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(intersect_closest);
KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
@@ -52,25 +52,61 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
#undef KERNEL_INTEGRATOR_INIT_FUNCTION
#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ float *pixel, \
+ const int width, \
+ const int buffer_stride, \
+ const int pixel_stride); \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+ const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ half4 *pixel, \
+ const int width, \
+ const int buffer_stride);
+
+KERNEL_FILM_CONVERT_FUNCTION(depth)
+KERNEL_FILM_CONVERT_FUNCTION(mist)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+KERNEL_FILM_CONVERT_FUNCTION(float)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path)
+KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+KERNEL_FILM_CONVERT_FUNCTION(combined)
+KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
/* --------------------------------------------------------------------
* Shader evaluation.
*/
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
const KernelShaderEvalInput *input,
- float4 *output,
+ float *output,
const int offset);
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
const KernelShaderEvalInput *input,
- float4 *output,
+ float *output,
const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
+ const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset);
/* --------------------------------------------------------------------
* Adaptive sampling.
*/
bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
- const KernelGlobals *kg,
+ const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int x,
int y,
@@ -79,14 +115,14 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
int offset,
int stride);
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int y,
int start_x,
int width,
int offset,
int stride);
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int x,
int start_y,
@@ -98,16 +134,8 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *
* Cryptomatte.
*/
-void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int pixel_index);
-/* --------------------------------------------------------------------
- * Bake.
- */
-/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 1432abfd330..747c47c34c9 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -29,27 +29,28 @@
# include "kernel/device/cpu/globals.h"
# include "kernel/device/cpu/image.h"
-# include "kernel/integrator/integrator_state.h"
-# include "kernel/integrator/integrator_state_flow.h"
-# include "kernel/integrator/integrator_state_util.h"
-
-# include "kernel/integrator/integrator_init_from_camera.h"
-# include "kernel/integrator/integrator_init_from_bake.h"
-# include "kernel/integrator/integrator_intersect_closest.h"
-# include "kernel/integrator/integrator_intersect_shadow.h"
-# include "kernel/integrator/integrator_intersect_subsurface.h"
-# include "kernel/integrator/integrator_intersect_volume_stack.h"
-# include "kernel/integrator/integrator_shade_background.h"
-# include "kernel/integrator/integrator_shade_light.h"
-# include "kernel/integrator/integrator_shade_shadow.h"
-# include "kernel/integrator/integrator_shade_surface.h"
-# include "kernel/integrator/integrator_shade_volume.h"
-# include "kernel/integrator/integrator_megakernel.h"
-
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_adaptive_sampling.h"
-# include "kernel/kernel_bake.h"
-# include "kernel/kernel_id_passes.h"
+# include "kernel/integrator/state.h"
+# include "kernel/integrator/state_flow.h"
+# include "kernel/integrator/state_util.h"
+
+# include "kernel/integrator/init_from_camera.h"
+# include "kernel/integrator/init_from_bake.h"
+# include "kernel/integrator/intersect_closest.h"
+# include "kernel/integrator/intersect_shadow.h"
+# include "kernel/integrator/intersect_subsurface.h"
+# include "kernel/integrator/intersect_volume_stack.h"
+# include "kernel/integrator/shade_background.h"
+# include "kernel/integrator/shade_light.h"
+# include "kernel/integrator/shade_shadow.h"
+# include "kernel/integrator/shade_surface.h"
+# include "kernel/integrator/shade_volume.h"
+# include "kernel/integrator/megakernel.h"
+
+# include "kernel/film/adaptive_sampling.h"
+# include "kernel/film/id_passes.h"
+# include "kernel/film/read.h"
+
+# include "kernel/bake/bake.h"
#else
# define STUB_ASSERT(arch, name) \
@@ -69,8 +70,20 @@ CCL_NAMESPACE_BEGIN
# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
#endif
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
#define DEFINE_INTEGRATOR_KERNEL(name) \
- void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
IntegratorStateCPU *state) \
{ \
KERNEL_INVOKE(name, kg, state); \
@@ -78,43 +91,45 @@ CCL_NAMESPACE_BEGIN
#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
- const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
{ \
KERNEL_INVOKE(name, kg, state, render_buffer); \
}
-/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
- * that it does not contain unused fields. */
-#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
- bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
- IntegratorStateCPU *state, \
- KernelWorkTile *tile, \
- ccl_global float *render_buffer) \
+#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+ IntegratorStateCPU *state) \
{ \
- return KERNEL_INVOKE( \
- name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ KERNEL_INVOKE(name, kg, &state->shadow); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \
}
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
-DEFINE_INTEGRATOR_KERNEL(intersect_closest)
-DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(intersect_closest)
DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
-DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow)
/* --------------------------------------------------------------------
* Shader evaluation.
*/
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
const KernelShaderEvalInput *input,
- float4 *output,
+ float *output,
const int offset)
{
#ifdef KERNEL_STUB
@@ -124,9 +139,9 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
#endif
}
-void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
const KernelShaderEvalInput *input,
- float4 *output,
+ float *output,
const int offset)
{
#ifdef KERNEL_STUB
@@ -136,12 +151,25 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
#endif
}
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
+ const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency);
+#else
+ kernel_curve_shadow_transparency_evaluate(kg, input, output, offset);
+#endif
+}
+
/* --------------------------------------------------------------------
* Adaptive sampling.
*/
bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
- const KernelGlobals *kg,
+ const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int x,
int y,
@@ -159,7 +187,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
#endif
}
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int y,
int start_x,
@@ -174,7 +202,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *
#endif
}
-void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int x,
int start_y,
@@ -193,7 +221,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *
* Cryptomatte.
*/
-void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
ccl_global float *render_buffer,
int pixel_index)
{
@@ -205,23 +233,83 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
}
/* --------------------------------------------------------------------
- * Bake.
+ * Film Convert.
*/
-/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-#if 0
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, bake);
-# else
-# ifdef __BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-# endif
-# endif /* KERNEL_STUB */
+#ifdef KERNEL_STUB
+
+# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ float *pixel, \
+ const int width, \
+ const int buffer_stride, \
+ const int pixel_stride) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+ } \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+ const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ half4 *pixel, \
+ const int width, \
+ const int buffer_stride) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \
+ }
+
+#else
+
+# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ float *pixel, \
+ const int width, \
+ const int buffer_stride, \
+ const int pixel_stride) \
+ { \
+ for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \
+ film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \
+ } \
+ } \
+ void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \
+ const KernelFilmConvert *kfilm_convert, \
+ const float *buffer, \
+ half4 *pixel, \
+ const int width, \
+ const int buffer_stride) \
+ { \
+ for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \
+ float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \
+ film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \
+ if (is_float) { \
+ pixel_rgba[1] = pixel_rgba[0]; \
+ pixel_rgba[2] = pixel_rgba[0]; \
+ } \
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \
+ *pixel = float4_to_half4_display( \
+ make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \
+ } \
+ }
+
#endif
-}
+
+KERNEL_FILM_CONVERT_FUNCTION(depth, true)
+KERNEL_FILM_CONVERT_FUNCTION(mist, true)
+KERNEL_FILM_CONVERT_FUNCTION(sample_count, true)
+KERNEL_FILM_CONVERT_FUNCTION(float, true)
+
+KERNEL_FILM_CONVERT_FUNCTION(light_path, false)
+KERNEL_FILM_CONVERT_FUNCTION(float3, false)
+
+KERNEL_FILM_CONVERT_FUNCTION(motion, false)
+KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false)
+KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false)
+KERNEL_FILM_CONVERT_FUNCTION(combined, false)
+KERNEL_FILM_CONVERT_FUNCTION(float4, false)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
#undef KERNEL_INVOKE
#undef DEFINE_INTEGRATOR_KERNEL
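
To see what the macro block above produces, here is the approximate expansion of KERNEL_FILM_CONVERT_FUNCTION(depth, true) in the non-stub branch, assuming KERNEL_ARCH is cpu (written out for illustration only):

void kernel_cpu_film_convert_depth(const KernelFilmConvert *kfilm_convert,
                                   const float *buffer,
                                   float *pixel,
                                   const int width,
                                   const int buffer_stride,
                                   const int pixel_stride)
{
  for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) {
    film_get_pass_pixel_depth(kfilm_convert, buffer, pixel);
  }
}

void kernel_cpu_film_convert_half_rgba_depth(const KernelFilmConvert *kfilm_convert,
                                             const float *buffer,
                                             half4 *pixel,
                                             const int width,
                                             const int buffer_stride)
{
  for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) {
    float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f};
    film_get_pass_pixel_depth(kfilm_convert, buffer, pixel_rgba);
    /* is_float == true: the scalar pass is replicated into G and B for display. */
    pixel_rgba[1] = pixel_rgba[0];
    pixel_rgba[2] = pixel_rgba[0];
    film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba);
    *pixel = float4_to_half4_display(
        make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3]));
  }
}
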
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
index 220768036ab..cece750a255 100644
--- a/intern/cycles/kernel/device/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -18,7 +18,7 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-#include "util/util_optimization.h"
+#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 90c05113cbe..fad4581236e 100644
--- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -18,7 +18,7 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-#include "util/util_optimization.h"
+#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
index fb85ef5b0d0..5fb4849ac08 100644
--- a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -18,7 +18,7 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-#include "util/util_optimization.h"
+#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
index 87baf04258a..c9424682fd4 100644
--- a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -18,7 +18,7 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-#include "util/util_optimization.h"
+#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
index bb421d58815..849ebf51989 100644
--- a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -18,7 +18,7 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-#include "util/util_optimization.h"
+#include "util/optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
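
Each of the five near-identical files above (AVX, AVX2, SSE2, SSE3, SSE4.1) is a translation unit that compiles the shared kernel implementation once per instruction set; when the build lacks that ISA, KERNEL_STUB turns every definition in kernel_arch_impl.h into a STUB_ASSERT body. Schematically (a sketch of the file skeleton, not a verbatim copy):

/* kernel_sse41.cpp, reduced to its skeleton: */
#include "util/optimization.h"

#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
#  define KERNEL_STUB /* kernel_arch_impl.h then emits STUB_ASSERT bodies */
#endif

#define KERNEL_ARCH cpu_sse41
#include "kernel/device/cpu/kernel_arch_impl.h"
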
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 3c85a8e7bd2..658dec102b1 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -52,14 +52,14 @@ typedef unsigned long long uint64_t;
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
-#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
@@ -76,6 +76,7 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -85,7 +86,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
@@ -129,7 +129,14 @@ __device__ half __float2half(const float f)
return val;
}
+__device__ float __half2float(const half h)
+{
+ float val;
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+ return val;
+}
+
/* Types */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index 46196dcdb51..003881d7912 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -92,12 +92,29 @@
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
-
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
+#define ccl_gpu_kernel_threads(block_num_threads) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state */
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda { \
+ __VA_ARGS__; \
+ __device__ int operator()(const int state) \
+ { \
+ return (func); \
+ } \
+ } ccl_gpu_kernel_lambda_pass
+
/* sanity checks */
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
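
CUDA, HIP and Metal cannot all accept a real C++ lambda through these shared headers, so ccl_gpu_kernel_lambda declares a local functor whose extra parameters become fields for captured state. The queued-paths kernel in gpu/kernel.h below uses it like this, and the expansion is mechanical:

/* Usage (taken from integrator_queued_paths_array further down): */
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                      int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

/* ...which expands to roughly: */
struct KernelLambda {
  int kernel_index; /* "captured" state, assigned by hand after the declaration */
  __device__ int operator()(const int state)
  {
    return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index);
  }
} ccl_gpu_kernel_lambda_pass;
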
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
index 169047175f5..e5023fad40c 100644
--- a/intern/cycles/kernel/device/cuda/globals.h
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -18,23 +18,25 @@
#pragma once
-#include "kernel/kernel_profiling.h"
-#include "kernel/kernel_types.h"
+#include "kernel/types.h"
-#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/state.h"
+
+#include "kernel/util/profiling.h"
CCL_NAMESPACE_BEGIN
/* Not actually used, just a NULL pointer that gets passed everywhere, which we
* hope gets optimized out by the compiler. */
-struct KernelGlobals {
+struct KernelGlobalsGPU {
int unused[1];
};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
/* Global scene data and textures */
__constant__ KernelData __data;
#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-#include "kernel/kernel_textures.h"
+#include "kernel/textures.h"
/* Integrator state */
__constant__ IntegratorStateGPU __integrator_state;
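
Note the asymmetry with the CPU: here KernelData and the KERNEL_TEX pointers are file-scope __constant__ globals, and the KernelGlobalsGPU object behind the KernelGlobals typedef is a dummy that is passed as NULL everywhere. The kernel_tex_fetch abstraction therefore resolves differently per device; schematically:

/* CPU (from cpu/globals.h above): data hangs off the kg pointer. */
#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))

/* GPU: data is a __constant__ global and kg is ignored. This GPU-side
 * definition is a sketch; it is not part of the hunks shown here. */
#define kernel_tex_fetch(tex, index) (tex[(index)])
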
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index b015c78a8f5..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+ float x,
+ float y)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
template<typename T>
ccl_device_noinline T kernel_tex_image_interp_nanovdb(
- const TextureInfo &info, float x, float y, float z, uint interpolation)
+ ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -189,9 +191,9 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
}
#endif
-ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
/* float4, byte4, ushort4 and half4 */
const int texture_type = info.data_type;
@@ -221,12 +223,12 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float
}
}
-ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
int id,
float3 P,
InterpolationType interp)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
if (info.use_transform_3d) {
P = transform_point(&info.transform_3d, P);
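
For the bicubic path above, the "4 bilinear lookups" trick splits the 1D cubic B-spline weights w0..w3 into two sums g0 = w0 + w1 and g1 = w2 + w3 with matching fractional offsets h0 and h1, so the texture unit's bilinear filter evaluates the inner blends in hardware. A sketch of the 2D combine, using the cubic_g*/cubic_h* helpers this file defines (the coordinate handling here is illustrative, not copied from the file):

/* Illustrative 2D combine; tex2D stands for the device's bilinear fetch. */
const float px = floorf(x - 0.5f) + 0.5f, fx = x - px;
const float py = floorf(y - 0.5f) + 0.5f, fy = y - py;

const float g0x = cubic_g0(fx), g1x = cubic_g1(fx);
const float x0 = px + cubic_h0(fx), x1 = px + cubic_h1(fx);
const float y0 = py + cubic_h0(fy), y1 = py + cubic_h1(fy);

return cubic_g0(fy) * (g0x * tex2D(tex, x0, y0) + g1x * tex2D(tex, x1, y0)) +
       cubic_g1(fy) * (g0x * tex2D(tex, x0, y1) + g1x * tex2D(tex, x1, y1));
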
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 7b79c0aedfa..eed005803e2 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -20,47 +20,64 @@
#include "kernel/device/gpu/parallel_prefix_sum.h"
#include "kernel/device/gpu/parallel_sorted_index.h"
-#include "kernel/integrator/integrator_state.h"
-#include "kernel/integrator/integrator_state_flow.h"
-#include "kernel/integrator/integrator_state_util.h"
-
-#include "kernel/integrator/integrator_init_from_bake.h"
-#include "kernel/integrator/integrator_init_from_camera.h"
-#include "kernel/integrator/integrator_intersect_closest.h"
-#include "kernel/integrator/integrator_intersect_shadow.h"
-#include "kernel/integrator/integrator_intersect_subsurface.h"
-#include "kernel/integrator/integrator_intersect_volume_stack.h"
-#include "kernel/integrator/integrator_shade_background.h"
-#include "kernel/integrator/integrator_shade_light.h"
-#include "kernel/integrator/integrator_shade_shadow.h"
-#include "kernel/integrator/integrator_shade_surface.h"
-#include "kernel/integrator/integrator_shade_volume.h"
-
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_work_stealing.h"
+#include "kernel/sample/lcg.h"
+
+/* Include constant tables before entering Metal's context class scope (context_begin.h) */
+#include "kernel/tables.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_begin.h"
+#endif
+
+#include "kernel/device/gpu/work_stealing.h"
+
+#include "kernel/integrator/state.h"
+#include "kernel/integrator/state_flow.h"
+#include "kernel/integrator/state_util.h"
+
+#include "kernel/integrator/init_from_bake.h"
+#include "kernel/integrator/init_from_camera.h"
+#include "kernel/integrator/intersect_closest.h"
+#include "kernel/integrator/intersect_shadow.h"
+#include "kernel/integrator/intersect_subsurface.h"
+#include "kernel/integrator/intersect_volume_stack.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_surface.h"
+#include "kernel/integrator/shade_volume.h"
+
+#include "kernel/bake/bake.h"
+
+#include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_end.h"
+#endif
+
+#include "kernel/film/read.h"
/* --------------------------------------------------------------------
* Integrator.
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_reset(int num_states)
+ ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
const int state = ccl_gpu_global_id_x();
if (state < num_states) {
- INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
- INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_camera,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -71,7 +88,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -80,16 +97,18 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
- get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+ ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
- integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_bake,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -100,7 +119,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -109,211 +128,312 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
- get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+ ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
- integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_closest,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_closest(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_shadow,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_shadow(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_subsurface(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_volume_stack(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_background(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_background,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_background(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_light(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_light,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_light(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_shadow(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_shadow,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_shadow(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
}
}
+#ifdef __KERNEL_METAL__
+constant int __dummy_constant [[function_constant(0)]];
+#endif
+
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface_raytrace(NULL, state, render_buffer);
+
+#ifdef __KERNEL_METAL__
+ KernelGlobals kg = NULL;
+ /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
+ * Dummy offset should not affect result, but somehow fixes bug! */
+ kg += __dummy_constant;
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
+#else
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
+#endif
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_volume(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_volume,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_volume(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_active_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
- (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
- (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
- });
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(
- int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
- gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
- return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
- INTEGRATOR_STATE(path, shader_sort_key) :
- GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
- });
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+ int num_states,
+ int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
+ int kernel_index)
+{
+ ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+ INTEGRATOR_STATE(state, path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
+ const uint state_index = ccl_gpu_global_id_x();
+ gpu_parallel_sorted_index_array(state_index,
+ num_states,
+ num_states_limit,
+ indices,
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
+{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+ int num_active_paths);
+ ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
+ }
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+ int num_active_paths);
+ ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) &&
- ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
- (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -321,14 +441,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_state_move(to_state, from_state);
+ ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *values, int num_values)
+ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+ prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+ gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
@@ -336,16 +456,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
- int sx,
- int sy,
- int sw,
- int sh,
- float threshold,
- bool reset,
- int offset,
- int stride,
- uint *num_active_pixels)
+ ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ ccl_global uint *num_active_pixels)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / sw;
@@ -354,37 +475,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
bool converged = true;
if (x < sw && y < sh) {
- converged = kernel_adaptive_sampling_convergence_check(
- nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_x(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int y = ccl_gpu_global_id_x();
if (y < sh) {
- kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_y(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int x = ccl_gpu_global_id_x();
if (x < sw) {
- kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
}
}
@@ -393,12 +528,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+ ccl_gpu_kernel_signature(cryptomatte_postprocess,
+ ccl_global float *render_buffer,
+ int num_pixels)
{
const int pixel_index = ccl_gpu_global_id_x();
if (pixel_index < num_pixels) {
- kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
}
}
@@ -406,202 +543,142 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
* Film.
*/
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
- float *pixels,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int dst_offset,
- int dst_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
- ccl_global const float *buffer = render_buffer + render_buffer_offset;
- ccl_global float *pixel = pixels +
- (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
- processor(kfilm_convert, buffer, pixel);
-}
-
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
- ccl_global const float *buffer = render_buffer + render_buffer_offset;
-
- float pixel[4];
- processor(kfilm_convert, buffer, pixel);
-
- film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
+ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
+ const int rgba_offset,
+ const int rgba_stride,
+ const int x,
+ const int y,
+ const half4 half_pixel)
+{
+ /* Work around HIP issue with half float display, see T92972. */
+#ifdef __KERNEL_HIP__
+ ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
+ out[0] = half_pixel.x;
+ out[1] = half_pixel.y;
+ out[2] = half_pixel.z;
+ out[3] = half_pixel.w;
+#else
ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
- float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- processor(kfilm_convert, buffer, pixel_rgba);
- pixel_rgba[3] = 1.0f;
- });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float value;
- processor(kfilm_convert, buffer, &value);
-
- pixel_rgba[0] = value;
- pixel_rgba[1] = value;
- pixel_rgba[2] = value;
- pixel_rgba[3] = 1.0f;
- });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
- ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
- (const KernelFilmConvert kfilm_convert, \
- float *pixels, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+ *out = half_pixel;
+#endif
+}
+
+#ifdef __KERNEL_METAL__
+
+/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
+ * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
+# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
+ float local_pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
+ if (input_channel_count >= 1) { \
+ pixel[0] = local_pixel[0]; \
+ } \
+ if (input_channel_count >= 2) { \
+ pixel[1] = local_pixel[1]; \
+ } \
+ if (input_channel_count >= 3) { \
+ pixel[2] = local_pixel[2]; \
+ } \
+ if (input_channel_count >= 4) { \
+ pixel[3] = local_pixel[3]; \
+ }
+
+#else
+
+# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);
+
+#endif
+
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global float *pixels, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_common(&kfilm_convert, \
- pixels, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ ccl_global float *pixel = pixels + \
+ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
+\
+ FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
} \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
- (const KernelFilmConvert kfilm_convert, \
- uchar4 *rgba, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+\
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
- rgba, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
- }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ const half4 half_pixel = float4_to_half4_display( \
+ make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+ kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
+ }
+
+/* 1 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
+
+#undef KERNEL_FILM_CONVERT_VARIANT
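The variant macros above derive the source address from a flat pixel index: x and y come from modulo and division by width, and the float offset into the render buffer is offset + x * pass_stride + y * stride * pass_stride. A hedged standalone sketch of that addressing, with made-up stride and size values:

// Hedged sketch of the addressing used by the film_convert kernels above.
// The strides and sizes are example values, not real Cycles data.
#include <cstdint>
#include <cstdio>

int main()
{
  const int width = 640, offset = 0, stride = 640;
  const int pass_stride = 16; /* floats per render-buffer pixel (assumed) */

  const int render_pixel_index = 123456;
  const int x = render_pixel_index % width;
  const int y = render_pixel_index / width;

  /* Same formula as the macro body: float offset into the render buffer. */
  const int64_t buffer_offset = offset + int64_t(x) * pass_stride +
                                int64_t(y) * stride * pass_stride;
  printf("pixel (%d, %d) reads render_buffer + %lld\n", x, y, (long long)buffer_offset);
  return 0;
}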
/* --------------------------------------------------------------------
* Shader evaluation.
@@ -610,28 +687,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
/* Displacement */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
- float4 *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_displace,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
+ }
+}
+
+/* Background */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ ccl_gpu_kernel_signature(shader_eval_background,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_displace_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
}
}
-/* Background Shader Evaluation */
+/* Curve Shadow Transparency */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
- float4 *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_background_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(
+ kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
}
}
@@ -640,15 +735,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_preprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int pass_denoised)
+ ccl_gpu_kernel_signature(filter_color_preprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -659,31 +755,34 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
- float *color_out = buffer + pass_denoised;
+ ccl_global float *color_out = buffer + pass_denoised;
color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int guiding_pass_normal,
- const float *render_buffer,
- int render_offset,
- int render_stride,
- int render_pass_stride,
- int render_pass_sample_count,
- int render_pass_denoising_albedo,
- int render_pass_denoising_normal,
- int full_x,
- int full_y,
- int width,
- int height,
- int num_samples)
+ ccl_gpu_kernel_signature(filter_guiding_preprocess,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ int guiding_pass_flow,
+ ccl_global const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int render_pass_motion,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -694,10 +793,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
- const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+ ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
float pixel_scale;
if (render_pass_sample_count == PASS_UNUSED) {
@@ -711,8 +810,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_albedo != PASS_UNUSED) {
kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
-    const float *albedo_in = buffer + render_pass_denoising_albedo;
-    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+    ccl_global const float *albedo_in = buffer + render_pass_denoising_albedo;
+    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = albedo_in[0] * pixel_scale;
albedo_out[1] = albedo_in[1] * pixel_scale;
@@ -720,24 +819,36 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
/* Normal pass. */
- if (render_pass_denoising_normal != PASS_UNUSED) {
+ if (guiding_pass_normal != PASS_UNUSED) {
kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
- const float *normal_in = buffer + render_pass_denoising_normal;
- float *normal_out = guiding_pixel + guiding_pass_normal;
+ ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+ ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
normal_out[0] = normal_in[0] * pixel_scale;
normal_out[1] = normal_in[1] * pixel_scale;
normal_out[2] = normal_in[2] * pixel_scale;
}
+
+ /* Flow pass. */
+ if (guiding_pass_flow != PASS_UNUSED) {
+ kernel_assert(render_pass_motion != PASS_UNUSED);
+
+ ccl_global const float *motion_in = buffer + render_pass_motion;
+ ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
+
+ flow_out[0] = -motion_in[0] * pixel_scale;
+ flow_out[1] = -motion_in[1] * pixel_scale;
+ }
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int width,
- int height)
+ ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
{
kernel_assert(guiding_pass_albedo != PASS_UNUSED);
@@ -750,9 +861,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = 0.5f;
albedo_out[1] = 0.5f;
@@ -760,20 +871,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_postprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int num_samples,
- int pass_noisy,
- int pass_denoised,
- int pass_sample_count,
- int num_components,
- bool use_compositing)
+ ccl_gpu_kernel_signature(filter_color_postprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -784,7 +896,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
float pixel_scale;
if (pass_sample_count == PASS_UNUSED) {
@@ -794,7 +906,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
pixel_scale = __float_as_uint(buffer[pass_sample_count]);
}
- float *denoised_pixel = buffer + pass_denoised;
+ ccl_global float *denoised_pixel = buffer + pass_denoised;
denoised_pixel[0] *= pixel_scale;
denoised_pixel[1] *= pixel_scale;
@@ -807,13 +919,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
/* Currently compositing passes are either 3-component (derived by dividing light passes)
* or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
* simplifies logic and avoids extra memory allocation. */
- const float *noisy_pixel = buffer + pass_noisy;
+ ccl_global const float *noisy_pixel = buffer + pass_noisy;
denoised_pixel[3] = noisy_pixel[3];
}
else {
/* Assigning to zero since this is a default alpha value for 3-component passes, and it
* is an opaque pixel for 4 component passes. */
-
denoised_pixel[3] = 0;
}
}
@@ -823,21 +934,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
- uint *num_possible_splits)
+ ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+ int num_states,
+ ccl_global uint *num_possible_splits)
{
const int state = ccl_gpu_global_id_x();
bool can_split = false;
if (state < num_states) {
- can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const auto can_split_mask = ccl_gpu_ballot(can_split);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
}
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index db4a4bf71e0..a5320edcb3c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
*
* Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
-#include "util/util_atomic.h"
+#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+ ActiveIndexContext(int _thread_index,
+ int _global_index,
+ int _threadgroup_size,
+ int _simdgroup_size,
+ int _simd_lane_index,
+ int _simd_group_index,
+ int _num_simd_groups,
+ threadgroup int *_simdgroup_offset)
+ : thread_index(_thread_index),
+ global_index(_global_index),
+ blocksize(_threadgroup_size),
+ ccl_gpu_warp_size(_simdgroup_size),
+ thread_warp(_simd_lane_index),
+ warp_index(_simd_group_index),
+ num_warps(_num_simd_groups),
+ warp_offset(_simdgroup_offset)
+ {
+ }
+
+ const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+ num_warps;
+ threadgroup int *warp_offset;
+
+ template<uint blocksizeDummy, typename IsActiveOp>
+ void active_index_array(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ IsActiveOp is_active_op)
+ {
+ const uint state_index = global_index;
+#else
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
- int *indices,
- int *num_indices,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint warp_index = thread_index / ccl_gpu_warp_size;
const uint num_warps = blocksize / ccl_gpu_warp_size;
- /* Test if state corresponding to this thread is active. */
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
- const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
- }
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
}
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
}
-}
+
+#ifdef __KERNEL_METAL__
+}; /* end class ActiveIndexContext */
+
+/* Inject the required thread params into a struct, and redirect to its
+ * templated member function. */
+# define gpu_parallel_active_index_array \
+ ActiveIndexContext(metal_local_id, \
+ metal_global_id, \
+ metal_local_size, \
+ simdgroup_size, \
+ simd_lane_index, \
+ simd_group_index, \
+ num_simd_groups, \
+ simdgroup_offset) \
+ .active_index_array
+#endif
CCL_NAMESPACE_END
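gpu_parallel_active_index_array implements a block-wide stream compaction: each warp ballots its active states, per-thread offsets come from a popcount over the lower lanes, and the last thread in the block scans the per-warp counts and reserves a contiguous output range with a single atomic. The serial sketch below shows only the contract of the routine, densely packed indices of all active states; the parallel offset computation is elided and every name is illustrative.

// Hedged serial equivalent of the block-wide stream compaction above.
#include <cstdio>
#include <functional>
#include <vector>

static int active_index_array(int num_states,
                              std::vector<int> &indices,
                              const std::function<bool(int)> &is_active_op)
{
  int num_indices = 0;
  for (int state = 0; state < num_states; state++) {
    if (is_active_op(state)) {
      indices[num_indices++] = state; /* dense output, order preserved */
    }
  }
  return num_indices;
}

int main()
{
  std::vector<int> indices(10);
  const int n = active_index_array(10, indices, [](int s) { return s % 3 == 0; });
  for (int i = 0; i < n; i++) {
    printf("%d ", indices[i]); /* prints: 0 3 6 9 */
  }
  printf("\n");
  return 0;
}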
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index a1349e82efb..4bd002c27e4 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
* This is used for an array the size of the number of shaders in the scene
* which is not usually huge, so might not be a significant bottleneck. */
-#include "util/util_atomic.h"
+#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
@@ -33,16 +33,20 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
-template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+ ccl_global int *counter,
+ ccl_global int *prefix_sum,
+ const int num_values)
{
- if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ if (global_id != 0) {
return;
}
int offset = 0;
for (int i = 0; i < num_values; i++) {
- const int new_offset = offset + values[i];
- values[i] = offset;
+ const int new_offset = offset + counter[i];
+ prefix_sum[i] = offset;
+ counter[i] = 0;
offset = new_offset;
}
}
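The reworked prefix sum is intentionally serial: a single thread computes an exclusive scan of counter[] into prefix_sum[] and zeroes the counters so they can accumulate keys for the next iteration. A hedged standalone sketch of exactly that loop:

// Hedged sketch of the single-thread exclusive prefix sum above: prefix_sum[i]
// receives the sum of counter[0..i-1], and counter[] is reset for reuse.
#include <cstdio>

int main()
{
  int counter[4] = {3, 1, 4, 2};
  int prefix_sum[4];

  int offset = 0;
  for (int i = 0; i < 4; i++) {
    const int new_offset = offset + counter[i];
    prefix_sum[i] = offset;
    counter[i] = 0; /* reset, matching the kernel's behavior */
    offset = new_offset;
  }

  for (int i = 0; i < 4; i++) {
    printf("%d ", prefix_sum[i]); /* prints: 0 3 4 8 */
  }
  printf("\n");
  return 0;
}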
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index 9bca1fad22f..c092e2a21ee 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN
*
* TODO: there may be ways to optimize this to avoid this many atomic ops? */
-#include "util/util_atomic.h"
+#include "util/atomic.h"
#ifdef __HIP__
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
@@ -33,20 +33,30 @@ CCL_NAMESPACE_BEGIN
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
- int *indices,
- int *num_indices,
- int *key_prefix_sum,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+ const uint num_states,
+ const int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
GetKeyOp get_key_op)
{
- const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
const int key = (state_index < num_states) ? get_key_op(state_index) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
- indices[index] = state_index;
+ if (index < num_states_limit) {
+ /* Assign state index. */
+ indices[index] = state_index;
+ }
+ else {
+ /* Can't process this state now, increase the counter again so that
+ * it will be handled in another iteration. */
+ atomic_fetch_and_add_uint32(&key_counter[key], 1);
+ }
}
}
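gpu_parallel_sorted_index_array buckets states by key: key_prefix_sum[key] holds the bucket's next write position (claimed atomically on the GPU), and a state that would land past num_states_limit is re-counted in key_counter so a later iteration can pick it up. A serial sketch of that scatter, with invented keys and bucket offsets:

// Hedged serial equivalent of the sorted-index scatter above; all data is
// illustrative. States are grouped by key, overflow states are re-counted.
#include <cstdio>

int main()
{
  const int keys[6] = {1, 0, 1, 0, 0, 1}; /* e.g. shader id per state */
  int key_prefix_sum[2] = {0, 3};         /* bucket start offsets (exclusive scan) */
  int key_counter[2] = {0, 0};
  const int num_states_limit = 4;
  int indices[4];

  for (int state = 0; state < 6; state++) {
    const int key = keys[state];
    const int index = key_prefix_sum[key]++; /* atomic on the GPU */
    if (index < num_states_limit) {
      indices[index] = state;
    }
    else {
      key_counter[key]++; /* deferred to another iteration */
    }
  }

  for (int i = 0; i < num_states_limit; i++) {
    printf("%d ", indices[i]);
  }
  printf("| deferred: key0=%d key1=%d\n", key_counter[0], key_counter[1]);
  /* prints: 1 3 4 0 | deferred: key0=0 key1=2 */
  return 0;
}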
diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h
new file mode 100644
index 00000000000..c3083948057
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/work_stealing.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * Utility functions for work stealing
+ */
+
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+ uint sample_offset, pixel_offset;
+
+ if (kernel_data.integrator.scrambling_distance < 0.9f) {
+ /* Keep threads for the same sample together. */
+ uint tile_pixels = tile->w * tile->h;
+ sample_offset = global_work_index / tile_pixels;
+ pixel_offset = global_work_index - sample_offset * tile_pixels;
+ }
+ else {
+    /* Keep threads for the same pixel together.
+ * Appears to improve performance by a few % on CUDA and OptiX. */
+ sample_offset = global_work_index % tile->num_samples;
+ pixel_offset = global_work_index / tile->num_samples;
+ }
+
+ uint y_offset = pixel_offset / tile->w;
+ uint x_offset = pixel_offset - y_offset * tile->w;
+
+ *x = tile->x + x_offset;
+ *y = tile->y + y_offset;
+ *sample = tile->start_sample + sample_offset;
+}
+
+CCL_NAMESPACE_END
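get_work_pixel chooses between two decompositions of the flat work index: sample-major (threads for one sample are contiguous, used when the scrambling distance is below 0.9) and pixel-major (all samples of one pixel are contiguous). A hedged sketch of both mappings with an invented tile:

// Hedged sketch of the two work-index decompositions in get_work_pixel().
#include <cstdio>

int main()
{
  const unsigned w = 8, h = 4, num_samples = 16;
  const unsigned global_work_index = 100;

  /* Sample-major: threads for the same sample are contiguous. */
  const unsigned tile_pixels = w * h;
  const unsigned sample_a = global_work_index / tile_pixels;
  const unsigned pixel_a = global_work_index - sample_a * tile_pixels;

  /* Pixel-major: threads for the same pixel are contiguous. */
  const unsigned sample_b = global_work_index % num_samples;
  const unsigned pixel_b = global_work_index / num_samples;

  printf("sample-major: pixel %u, sample %u\n", pixel_a, sample_a);
  printf("pixel-major:  pixel %u, sample %u\n", pixel_b, sample_b);
  return 0;
}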
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 95338fe7d6e..fff7a09e884 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,14 +45,14 @@ typedef unsigned long long uint64_t;
#define ccl_device_forceinline __device__ __forceinline__
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
-#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
@@ -75,6 +75,7 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -84,7 +85,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot(predicate)
#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
typedef hipTextureObject_t ccl_gpu_tex_object;
@@ -117,5 +117,5 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object tex
/* Types */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
index 2fde0d46015..7ec744d8ad2 100644
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
-
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
+#define ccl_gpu_kernel_threads(block_num_threads) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state */
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda { \
+ __VA_ARGS__; \
+ __device__ int operator()(const int state) \
+ { \
+ return (func); \
+ } \
+ } ccl_gpu_kernel_lambda_pass
+
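For reference, ccl_gpu_kernel_lambda declares a function object named ccl_gpu_kernel_lambda_pass whose operator() evaluates `func` for a state index, with the trailing macro arguments becoming captured fields. A plain C++ illustration of the expanded shape; the kernel_index field and the predicate are invented examples, not the real call sites.

// Hedged sketch of what ccl_gpu_kernel_lambda(...) expands to, in plain C++.
#include <cstdio>

struct KernelLambda {
  int kernel_index; /* "captured state", passed via the macro's ... arguments */
  int operator()(const int state) const
  {
    return (state % 7) == kernel_index; /* stands in for the "func" expression */
  }
} ccl_gpu_kernel_lambda_pass{3};

int main()
{
  printf("%d %d\n", ccl_gpu_kernel_lambda_pass(10), ccl_gpu_kernel_lambda_pass(15));
  return 0;
}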
/* sanity checks */
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h
index 39978ae7899..d9a560d668b 100644
--- a/intern/cycles/kernel/device/hip/globals.h
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -18,24 +18,25 @@
#pragma once
-#include "kernel/kernel_profiling.h"
-#include "kernel/kernel_types.h"
+#include "kernel/types.h"
-#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/state.h"
+
+#include "kernel/util/profiling.h"
CCL_NAMESPACE_BEGIN
/* Not actually used, just a NULL pointer that gets passed everywhere, which we
* hope gets optimized out by the compiler. */
-struct KernelGlobals {
- /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+struct KernelGlobalsGPU {
int unused[1];
};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
/* Global scene data and textures */
__constant__ KernelData __data;
#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
-#include "kernel/kernel_textures.h"
+#include "kernel/textures.h"
/* Integrator state */
__constant__ IntegratorStateGPU __integrator_state;
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
new file mode 100644
index 00000000000..1222b68f0ee
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -0,0 +1,316 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_METAL__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+#include <metal_atomic>
+#include <metal_pack>
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+#ifdef __METALRT__
+using namespace metal::raytracing;
+#endif
+
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"
+
+/* Qualifiers */
+
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline ccl_device __attribute__((noinline))
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
+#define ccl_global device
+#define ccl_inline_constant static constant constexpr
+#define ccl_device_constant constant
+#define ccl_constant constant
+#define ccl_gpu_shared threadgroup
+#define ccl_private thread
+#define ccl_may_alias
+#define ccl_restrict __restrict
+#define ccl_loop_no_unroll
+#define ccl_align(n) alignas(n)
+#define ccl_optional_struct_init
+
+/* No assert supported for Metal */
+
+#define kernel_assert(cond)
+
+#define ccl_gpu_global_id_x() metal_global_id
+#define ccl_gpu_warp_size simdgroup_size
+#define ccl_gpu_thread_idx_x simd_group_index
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
+
+#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
+
+// clang-format off
+
+/* kernel.h adapters */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+/* Convert a comma-separated list into a semicolon-separated list
+ * (so that we can generate a struct based on kernel entry-point parameters). */
+#define FN0()
+#define FN1(p1) p1;
+#define FN2(p1, p2) p1; p2;
+#define FN3(p1, p2, p3) p1; p2; p3;
+#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
+#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
+#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
+#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
+#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
+#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
+#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
+#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
+#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
+#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
+#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
+#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
+#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
+#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17;
+#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18;
+#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19;
+#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+
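PARAMS_MAKER relies on the classic variadic argument-counting trick: appending FN20…FN0 to the argument list lets GET_LAST_ARG select the FNx whose arity matches the parameter count. A cut-down, compilable illustration with three arities instead of twenty (all names here are invented):

// Hedged illustration of the PARAMS_MAKER argument-counting trick.
#include <cstdio>

#define FN1(p1) p1;
#define FN2(p1, p2) p1; p2;
#define FN3(p1, p2, p3) p1; p2; p3;
#define GET_LAST(a1, a2, a3, a4, ...) a4
#define PARAMS(...) GET_LAST(__VA_ARGS__, FN3, FN2, FN1)

struct Demo {
  PARAMS(int a, float b)(int a, float b) /* expands to: int a; float b; */
};

int main()
{
  Demo d{1, 2.5f};
  printf("%d %f\n", d.a, d.b);
  return 0;
}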
+/* Generate a struct containing the entry-point parameters and a "run"
+ * method which can access them implicitly via this-> */
+#define ccl_gpu_kernel_signature(name, ...) \
+struct kernel_gpu_##name \
+{ \
+ PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
+ void run(thread MetalKernelContext& context, \
+ threadgroup int *simdgroup_offset, \
+ const uint metal_global_id, \
+ const ushort metal_local_id, \
+ const ushort metal_local_size, \
+ uint simdgroup_size, \
+ uint simd_lane_index, \
+ uint simd_group_index, \
+ uint num_simd_groups) ccl_global const; \
+}; \
+kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
+ constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
+ constant MetalAncillaries *_metal_ancillaries, \
+ threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+ const uint metal_global_id [[thread_position_in_grid]], \
+ const ushort metal_local_id [[thread_position_in_threadgroup]], \
+ const ushort metal_local_size [[threads_per_threadgroup]], \
+ uint simdgroup_size [[threads_per_simdgroup]], \
+ uint simd_lane_index [[thread_index_in_simdgroup]], \
+ uint simd_group_index [[simdgroup_index_in_threadgroup]], \
+ uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
+ MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
+ params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+} \
+void kernel_gpu_##name::run(thread MetalKernelContext& context, \
+ threadgroup int *simdgroup_offset, \
+ const uint metal_global_id, \
+ const ushort metal_local_id, \
+ const ushort metal_local_size, \
+ uint simdgroup_size, \
+ uint simd_lane_index, \
+ uint simd_group_index, \
+ uint num_simd_groups) ccl_global const
+
+#define ccl_gpu_kernel_call(x) context.x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to specify captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda \
+ { \
+ KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
+ ccl_private MetalKernelContext &context; \
+ __VA_ARGS__; \
+ int operator()(const int state) const { return (func); } \
+ } ccl_gpu_kernel_lambda_pass(context)
+
+// clang-format on
+
+/* Volumetric lambda functions: use function objects for lambda-like functionality. */
+#define VOLUME_READ_LAMBDA(function_call) \
+ struct FnObjectRead { \
+ KernelGlobals kg; \
+ ccl_private MetalKernelContext *context; \
+ int state; \
+\
+ VolumeStack operator()(const int i) const \
+ { \
+ return context->function_call; \
+ } \
+ } volume_read_lambda_pass{kg, this, state};
+
+#define VOLUME_WRITE_LAMBDA(function_call) \
+ struct FnObjectWrite { \
+ KernelGlobals kg; \
+ ccl_private MetalKernelContext *context; \
+ int state; \
+\
+ void operator()(const int i, VolumeStack entry) const \
+ { \
+ context->function_call; \
+ } \
+ } volume_write_lambda_pass{kg, this, state};
+
+/* make_type definitions with Metal style element initializers */
+#ifdef make_float2
+# undef make_float2
+#endif
+#ifdef make_float3
+# undef make_float3
+#endif
+#ifdef make_float4
+# undef make_float4
+#endif
+#ifdef make_int2
+# undef make_int2
+#endif
+#ifdef make_int3
+# undef make_int3
+#endif
+#ifdef make_int4
+# undef make_int4
+#endif
+#ifdef make_uchar4
+# undef make_uchar4
+#endif
+
+#define make_float2(x, y) float2(x, y)
+#define make_float3(x, y, z) float3(x, y, z)
+#define make_float4(x, y, z, w) float4(x, y, z, w)
+#define make_int2(x, y) int2(x, y)
+#define make_int3(x, y, z) int3(x, y, z)
+#define make_int4(x, y, z, w) int4(x, y, z, w)
+#define make_uchar4(x, y, z, w) uchar4(x, y, z, w)
+
+/* Math functions */
+
+#define __uint_as_float(x) as_type<float>(x)
+#define __float_as_uint(x) as_type<uint>(x)
+#define __int_as_float(x) as_type<float>(x)
+#define __float_as_int(x) as_type<int>(x)
+#define __float2half(x) half(x)
+#define powf(x, y) pow(float(x), float(y))
+#define fabsf(x) fabs(float(x))
+#define copysignf(x, y) copysign(float(x), float(y))
+#define asinf(x) asin(float(x))
+#define acosf(x) acos(float(x))
+#define atanf(x) atan(float(x))
+#define floorf(x) floor(float(x))
+#define ceilf(x) ceil(float(x))
+#define hypotf(x, y) hypot(float(x), float(y))
+#define atan2f(x, y) atan2(float(x), float(y))
+#define fmaxf(x, y) fmax(float(x), float(y))
+#define fminf(x, y) fmin(float(x), float(y))
+#define fmodf(x, y) fmod(float(x), float(y))
+#define sinhf(x) sinh(float(x))
+#define coshf(x) cosh(float(x))
+#define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))
+
+/* Use native functions with possibly lower precision for performance,
+ * no issues found so far. */
+#define trigmode fast
+#define sinf(x) trigmode::sin(float(x))
+#define cosf(x) trigmode::cos(float(x))
+#define tanf(x) trigmode::tan(float(x))
+#define expf(x) trigmode::exp(float(x))
+#define sqrtf(x) trigmode::sqrt(float(x))
+#define logf(x) trigmode::log(float(x))
+
+#define NULL 0
+
+#define __device__
+
+#ifdef __METALRT__
+
+# define __KERNEL_GPU_RAYTRACING__
+
+# if defined(__METALRT_MOTION__)
+# define METALRT_TAGS instancing, instance_motion, primitive_motion
+# else
+# define METALRT_TAGS instancing
+# endif /* __METALRT_MOTION__ */
+
+typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
+typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
+typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
+
+#endif /* __METALRT__ */
+
+/* texture bindings and sampler setup */
+
+struct Texture2DParamsMetal {
+ texture2d<float, access::sample> tex;
+};
+struct Texture3DParamsMetal {
+ texture3d<float, access::sample> tex;
+};
+
+struct MetalAncillaries {
+ device Texture2DParamsMetal *textures_2d;
+ device Texture3DParamsMetal *textures_3d;
+
+#ifdef __METALRT__
+ metalrt_as_type accel_struct;
+ metalrt_ift_type ift_default;
+ metalrt_ift_type ift_shadow;
+ metalrt_ift_type ift_local;
+#endif
+};
+
+#include "util/half.h"
+#include "util/types.h"
+
+enum SamplerType {
+ SamplerFilterNearest_AddressRepeat,
+ SamplerFilterNearest_AddressClampEdge,
+ SamplerFilterNearest_AddressClampZero,
+
+ SamplerFilterLinear_AddressRepeat,
+ SamplerFilterLinear_AddressClampEdge,
+ SamplerFilterLinear_AddressClampZero,
+
+ SamplerCount
+};
+
+constant constexpr array<sampler, SamplerCount> metal_samplers = {
+ sampler(address::repeat, filter::nearest),
+ sampler(address::clamp_to_edge, filter::nearest),
+ sampler(address::clamp_to_zero, filter::nearest),
+ sampler(address::repeat, filter::linear),
+ sampler(address::clamp_to_edge, filter::linear),
+ sampler(address::clamp_to_zero, filter::linear),
+};
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
new file mode 100644
index 00000000000..2eefd795aa1
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+
+/* Open the Metal kernel context class
+ * Necessary to access resource bindings */
+class MetalKernelContext {
+ public:
+ constant KernelParamsMetal &launch_params_metal;
+ constant MetalAncillaries *metal_ancillaries;
+
+ MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
+ : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
+ {}
+
+ MetalKernelContext(constant KernelParamsMetal &_launch_params_metal)
+ : launch_params_metal(_launch_params_metal)
+ {}
+
+ /* texture fetch adapter functions */
+ typedef uint64_t ccl_gpu_tex_object;
+
+ template<typename T>
+ inline __attribute__((__always_inline__))
+ T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+ kernel_assert(0);
+ return 0;
+ }
+ template<typename T>
+ inline __attribute__((__always_inline__))
+ T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+ kernel_assert(0);
+ return 0;
+ }
+
+ // texture2d
+ template<>
+ inline __attribute__((__always_inline__))
+ float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+ }
+ template<>
+ inline __attribute__((__always_inline__))
+ float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+ }
+
+ // texture3d
+ template<>
+ inline __attribute__((__always_inline__))
+ float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+ }
+ template<>
+ inline __attribute__((__always_inline__))
+ float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+ }
+# include "kernel/device/gpu/image.h"
+
+ // clang-format on
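The adapters above unpack a 64-bit texture handle: the low 32 bits index into textures_2d/textures_3d and the high 32 bits index into metal_samplers. A hedged host-side sketch of that packing; the helper name is invented:

// Hedged sketch of the 64-bit texture handle layout used by the Metal
// adapters above: texture slot id in the low 32 bits, sampler id in the high.
#include <cstdint>
#include <cstdio>

static uint64_t make_tex_handle(uint32_t tex_slot, uint32_t sampler_id)
{
  return uint64_t(sampler_id) << 32 | tex_slot;
}

int main()
{
  const uint64_t handle = make_tex_handle(/*tex_slot=*/7, /*sampler_id=*/3);
  const uint32_t tid = uint32_t(handle);       /* matches `const uint tid(tex)` */
  const uint32_t sid = uint32_t(handle >> 32); /* matches `const uint sid(tex >> 32)` */
  printf("tid=%u sid=%u\n", tid, sid);
  return 0;
}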
diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h
new file mode 100644
index 00000000000..e700f294440
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+}
+; /* end of MetalKernelContext class definition */
+
+/* Silently redirect into the MetalKernelContext instance */
+/* NOTE: These macros will need maintaining as entry-points change. */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state context.launch_params_metal.__integrator_state
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
new file mode 100644
index 00000000000..1aea36589d0
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+#include "kernel/integrator/state.h"
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct KernelParamsMetal {
+
+#define KERNEL_TEX(type, name) ccl_global const type *name;
+#include "kernel/textures.h"
+#undef KERNEL_TEX
+
+ const IntegratorStateGPU __integrator_state;
+ const KernelData data;
+
+} KernelParamsMetal;
+
+typedef struct KernelGlobalsGPU {
+ int unused[1];
+} KernelGlobalsGPU;
+
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data launch_params_metal.data
+#define kernel_integrator_state launch_params_metal.__integrator_state
+
+/* data lookup defines */
+
+#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
+#define kernel_tex_array(tex) launch_params_metal.tex
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
new file mode 100644
index 00000000000..6b77940660f
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -0,0 +1,765 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Metal kernel entry points */
+
+#include "kernel/device/metal/compat.h"
+#include "kernel/device/metal/globals.h"
+#include "kernel/device/gpu/kernel.h"
+
+/* MetalRT intersection handlers */
+#ifdef __METALRT__
+
+/* Return type for a bounding box intersection function. */
+struct BoundingBoxIntersectionResult
+{
+ bool accept [[accept_intersection]];
+ bool continue_search [[continue_search]];
+ float distance [[distance]];
+};
+
+/* Return type for a triangle intersection function. */
+struct TriangleIntersectionResult
+{
+ bool accept [[accept_intersection]];
+ bool continue_search [[continue_search]];
+};
+
+enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };
+
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+ const int object,
+ const int prim)
+{
+ return (self.prim == prim) && (self.object == object);
+}
+
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+ const int object,
+ const int prim)
+{
+ return ((self.prim == prim) && (self.object == object)) ||
+ ((self.light_prim == prim) && (self.light_object == object));
+}
+
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+ const int prim)
+{
+ return (self.prim == prim);
+}
+
+template<typename TReturn, uint intersection_type>
+TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
+ const uint object,
+ const uint primitive_id,
+ const float2 barycentrics,
+ const float ray_tmax)
+{
+ TReturn result;
+
+#ifdef __BVH_LOCAL__
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+
+ if ((object != payload.local_object) || intersection_skip_self_local(payload.self, prim)) {
+    /* Only intersect with matching object and skip self-intersection. */
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+
+ const short max_hits = payload.max_hits;
+ if (max_hits == 0) {
+    /* Special case for when no hit information is requested; just report that something was hit. */
+ payload.result = true;
+ result.accept = true;
+ result.continue_search = false;
+ return result;
+ }
+
+ int hit = 0;
+ if (payload.has_lcg_state) {
+ for (short i = min(max_hits, short(payload.local_isect.num_hits)) - 1; i >= 0; --i) {
+ if (ray_tmax == payload.local_isect.hits[i].t) {
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+ }
+
+ hit = payload.local_isect.num_hits++;
+
+ if (payload.local_isect.num_hits > max_hits) {
+ hit = lcg_step_uint(&payload.lcg_state) % payload.local_isect.num_hits;
+ if (hit >= max_hits) {
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+ }
+ }
+ else {
+ if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) {
+      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee about distance ordering in any-hit. */
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+
+ payload.local_isect.num_hits = 1;
+ }
+
+ ray_data Intersection *isect = &payload.local_isect.hits[hit];
+ isect->t = ray_tmax;
+ isect->prim = prim;
+ isect->object = object;
+ isect->type = kernel_tex_fetch(__objects, object).primitive_type;
+
+ isect->u = 1.0f - barycentrics.y - barycentrics.x;
+ isect->v = barycentrics.x;
+
+  /* Record geometric normal. */
+ const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect->prim).w;
+ const float3 tri_a = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
+ const float3 tri_b = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
+ const float3 tri_c = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+ payload.local_isect.Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+  /* Continue tracing (without this the trace call would return after the first hit). */
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+#endif
+}
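
The `has_lcg_state` branch above is reservoir sampling: once the hit array is full, each new hit overwrites a uniformly random slot with probability `max_hits / num_hits`, keeping an unbiased sample of all hits seen. A standalone C++ sketch of the same bookkeeping, with a local `lcg_step` standing in for the kernel's `lcg_step_uint`:

```cpp
#include <cstdint>
#include <cstdio>

static uint32_t lcg_step(uint32_t *state)
{
  *state = *state * 1664525u + 1013904223u; /* Numerical Recipes LCG. */
  return *state;
}

int main()
{
  const int max_hits = 4;
  int hits[max_hits];
  int num_hits = 0;
  uint32_t lcg_state = 0xdeadbeefu;

  for (int candidate = 0; candidate < 100; candidate++) {
    int slot = num_hits++;
    if (num_hits > max_hits) {
      /* Array is full: pick a random slot among all hits seen so far. */
      slot = lcg_step(&lcg_state) % num_hits;
      if (slot >= max_hits) {
        continue; /* Reject: this candidate is not recorded. */
      }
    }
    hits[slot] = candidate;
  }
  for (int i = 0; i < max_hits; i++) {
    printf("recorded hit %d\n", hits[i]);
  }
}
```
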
+
+[[intersection(triangle, triangle_data, METALRT_TAGS)]]
+TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+ uint instance_id [[user_instance_id]],
+ uint primitive_id [[primitive_id]],
+ float2 barycentrics [[barycentric_coord]],
+ float ray_tmax [[distance]])
+{
+ return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+ launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
+{
+  /* Unused function. */
+ BoundingBoxIntersectionResult result;
+ result.distance = ray_tmax;
+ result.accept = false;
+ result.continue_search = false;
+ return result;
+}
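
On the (u, v) remapping in `metalrt_local_hit` above: Metal reports triangle barycentrics as a `float2`, and the handlers convert them to the pair Cycles stores, with the third weight left implicit. A small sketch of the remapping; the `float2` type here is a local stand-in:

```cpp
#include <cassert>

struct float2 { float x, y; };

/* Remap the reported barycentrics to the (u, v) pair stored in the
 * Intersection, as done in the hit handlers above. */
static void remap_barycentrics(float2 b, float &u, float &v)
{
  u = 1.0f - b.y - b.x;
  v = b.x;
}

int main()
{
  float u, v;
  remap_barycentrics({0.25f, 0.5f}, u, v);
  assert(u == 0.25f && v == 0.25f);
  /* The three weights always sum to 1; the implicit one equals b.y. */
  const float w = 1.0f - u - v;
  assert(w == 0.5f);
}
```
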
+
+template<uint intersection_type>
+bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+ uint object,
+ uint prim,
+ const float2 barycentrics,
+ const float ray_tmax)
+{
+#ifdef __SHADOW_RECORD_ALL__
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = payload.visibility;
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    /* Continue search. */
+ return true;
+ }
+# endif
+
+ if (intersection_skip_self_shadow(payload.self, object, prim)) {
+    /* Continue search. */
+ return true;
+ }
+
+ float u = 0.0f, v = 0.0f;
+ int type = 0;
+ if (intersection_type == METALRT_HIT_TRIANGLE) {
+ u = 1.0f - barycentrics.y - barycentrics.x;
+ v = barycentrics.x;
+ type = kernel_tex_fetch(__objects, object).primitive_type;
+ }
+# ifdef __HAIR__
+ else {
+ u = barycentrics.x;
+ v = barycentrics.y;
+
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+ type = segment.type;
+ prim = segment.prim;
+
+    /* Filter out curve endcaps. */
+ if (u == 0.0f || u == 1.0f) {
+      /* Continue search. */
+ return true;
+ }
+ }
+# endif
+
+# ifndef __TRANSPARENT_SHADOWS__
+  /* No transparent shadow support compiled in; make the hit opaque. */
+ payload.result = true;
+  /* Terminate ray. */
+ return false;
+# else
+ short max_hits = payload.max_hits;
+ short num_hits = payload.num_hits;
+ short num_recorded_hits = payload.num_recorded_hits;
+
+ MetalKernelContext context(launch_params_metal);
+
+  /* If the hit is opaque, or the maximum number of hits was reached, all light is blocked and
+   * we can stop immediately. */
+ if (num_hits >= max_hits ||
+ !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+ payload.result = true;
+    /* Terminate ray. */
+ return false;
+ }
+
+ /* Always use baked shadow transparency for curves. */
+ if (type & PRIMITIVE_CURVE) {
+ float throughput = payload.throughput;
+ throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u);
+ payload.throughput = throughput;
+ payload.num_hits += 1;
+
+ if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+      /* Accept result and terminate if throughput is sufficiently low. */
+ payload.result = true;
+ return false;
+ }
+ else {
+ return true;
+ }
+ }
+
+ payload.num_hits += 1;
+ payload.num_recorded_hits += 1;
+
+ uint record_index = num_recorded_hits;
+
+ const IntegratorShadowState state = payload.state;
+
+ const uint max_record_hits = min(uint(max_hits), INTEGRATOR_SHADOW_ISECT_SIZE);
+ if (record_index >= max_record_hits) {
+ /* If maximum number of hits reached, find a hit to replace. */
+ float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+ uint max_recorded_hit = 0;
+
+ for (int i = 1; i < max_record_hits; i++) {
+ const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+ if (isect_t > max_recorded_t) {
+ max_recorded_t = isect_t;
+ max_recorded_hit = i;
+ }
+ }
+
+ if (ray_tmax >= max_recorded_t) {
+      /* Accept hit, so that we don't consider any more hits beyond the distance of the
+       * current hit. */
+ payload.result = true;
+ return true;
+ }
+
+ record_index = max_recorded_hit;
+ }
+
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = ray_tmax;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
+
+ /* Continue tracing. */
+# endif /* __TRANSPARENT_SHADOWS__ */
+#endif /* __SHADOW_RECORD_ALL__ */
+
+ return true;
+}
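
When the shadow record overflows, `metalrt_shadow_all_hit` replaces the farthest recorded intersection, so the array converges to the `max_record_hits` nearest hits; a hit farther than everything already recorded is accepted instead, which clamps the ray's t_max. A host-side sketch of the replacement policy (the t_max clamping side effect is omitted here):

```cpp
#include <cstdio>

static void record_shadow_hit(float t, float *isect_t, int max_record_hits, int *num_recorded)
{
  int index = *num_recorded;
  if (index >= max_record_hits) {
    /* Find the farthest recorded hit. */
    int farthest = 0;
    for (int i = 1; i < max_record_hits; i++) {
      if (isect_t[i] > isect_t[farthest]) {
        farthest = i;
      }
    }
    if (t >= isect_t[farthest]) {
      return; /* New hit is farther than everything recorded; drop it. */
    }
    index = farthest;
  }
  else {
    (*num_recorded)++;
  }
  isect_t[index] = t;
}

int main()
{
  float isect_t[2];
  int num_recorded = 0;
  const float hits[] = {0.9f, 0.2f, 0.7f, 0.4f};
  for (float t : hits) {
    record_shadow_hit(t, isect_t, 2, &num_recorded);
  }
  for (int i = 0; i < num_recorded; i++) {
    printf("t = %.1f\n", isect_t[i]); /* 0.4 and 0.2 survive. */
  }
}
```
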
+
+[[intersection(triangle, triangle_data, METALRT_TAGS)]]
+TriangleIntersectionResult
+__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+ unsigned int object [[user_instance_id]],
+ unsigned int primitive_id [[primitive_id]],
+ float2 barycentrics [[barycentric_coord]],
+ float ray_tmax [[distance]])
+{
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+
+ TriangleIntersectionResult result;
+ result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>(
+ launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
+ result.accept = !result.continue_search;
+ return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
+{
+  /* Unused function. */
+ BoundingBoxIntersectionResult result;
+ result.distance = ray_tmax;
+ result.accept = false;
+ result.continue_search = false;
+ return result;
+}
+
+template<typename TReturnType, uint intersection_type>
+inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+ const uint object,
+ const uint prim,
+ const float u)
+{
+ TReturnType result;
+
+# ifdef __HAIR__
+ if (intersection_type == METALRT_HIT_BOUNDING_BOX) {
+ /* Filter out curve endcaps. */
+ if (u == 0.0f || u == 1.0f) {
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+ }
+# endif
+
+ uint visibility = payload.visibility;
+# ifdef __VISIBILITY_FLAG__
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+# endif
+
+ /* Shadow ray early termination. */
+ if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+ if (intersection_skip_self_shadow(payload.self, object, prim)) {
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+ else {
+ result.accept = true;
+ result.continue_search = false;
+ return result;
+ }
+ }
+ else {
+ if (intersection_skip_self(payload.self, object, prim)) {
+ result.accept = false;
+ result.continue_search = true;
+ return result;
+ }
+ }
+
+ result.accept = true;
+ result.continue_search = true;
+ return result;
+}
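
The visibility test above is a plain bitmask overlap: the ray carries a mask of path flags, and an object is culled when its visibility mask shares no bits with it. A sketch with illustrative flag values; the real flags live in the kernel types header:

```cpp
#include <cstdint>
#include <cstdio>

enum : uint32_t {
  PATH_RAY_CAMERA = 1u << 0,
  PATH_RAY_SHADOW = 1u << 1,
};

static bool object_visible(uint32_t object_visibility, uint32_t ray_visibility)
{
  return (object_visibility & ray_visibility) != 0;
}

int main()
{
  /* An object visible to camera rays only, e.g. one casting no shadows. */
  const uint32_t object_visibility = PATH_RAY_CAMERA;
  printf("camera ray hits: %d\n", object_visible(object_visibility, PATH_RAY_CAMERA));
  printf("shadow ray hits: %d\n", object_visible(object_visibility, PATH_RAY_SHADOW));
}
```
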
+
+[[intersection(triangle, triangle_data, METALRT_TAGS)]]
+TriangleIntersectionResult
+__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+ unsigned int object [[user_instance_id]],
+ unsigned int primitive_id [[primitive_id]])
+{
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+ launch_params_metal, payload, object, prim, 0.0f);
+ if (result.accept) {
+ payload.prim = prim;
+ payload.type = kernel_tex_fetch(__objects, object).primitive_type;
+ }
+ return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
+{
+  /* Unused function. */
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+ return result;
+}
+
+#ifdef __HAIR__
+ccl_device_inline
+void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+ const uint object,
+ const uint prim,
+ const uint type,
+ const float3 ray_origin,
+ const float3 ray_direction,
+ float time,
+ const float ray_tmax,
+ thread BoundingBoxIntersectionResult &result)
+{
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = payload.visibility;
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return;
+ }
+# endif
+
+ float3 P = ray_origin;
+ float3 dir = ray_direction;
+
+  /* The direction is not normalized by default, but the curve intersection routine expects a
+   * normalized direction. */
+ float len;
+ dir = normalize_len(dir, &len);
+
+ Intersection isect;
+ isect.t = ray_tmax;
+ /* Transform maximum distance into object space. */
+ if (isect.t != FLT_MAX)
+ isect.t *= len;
+
+ MetalKernelContext context(launch_params_metal);
+ if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
+ launch_params_metal, payload, object, prim, isect.u);
+ if (result.accept) {
+ result.distance = isect.t / len;
+ payload.u = isect.u;
+ payload.v = isect.v;
+ payload.prim = prim;
+ payload.type = type;
+ }
+ }
+}
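
The length bookkeeping above matters because the object-space ray direction is generally not unit length: the maximum distance is multiplied by `len` before calling the intersector (which works against the normalized direction), and the reported hit distance is divided by `len` on the way out. A small numeric sketch of that round trip, with a local `normalize_len` mirroring the utility of the same name:

```cpp
#include <cmath>
#include <cstdio>

struct float3 { float x, y, z; };

static float3 normalize_len(float3 d, float *len)
{
  *len = std::sqrt(d.x * d.x + d.y * d.y + d.z * d.z);
  return {d.x / *len, d.y / *len, d.z / *len};
}

int main()
{
  float3 dir = {0.0f, 0.0f, 2.0f}; /* Unnormalized object-space direction. */
  float len;
  dir = normalize_len(dir, &len);

  const float t_max = 3.0f;          /* In units of the original direction. */
  const float t_unit = t_max * len;  /* Same distance against the unit direction. */
  const float t_hit_unit = 4.0f;     /* Hit found by the unit-direction intersector. */
  const float t_hit = t_hit_unit / len; /* Back to the caller's parameterization. */
  printf("len=%.1f t_unit=%.1f t_hit=%.1f\n", len, t_unit, t_hit);
}
```
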
+
+ccl_device_inline
+void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+ const uint object,
+ const uint prim,
+ const uint type,
+ const float3 ray_origin,
+ const float3 ray_direction,
+ float time,
+ const float ray_tmax,
+ thread BoundingBoxIntersectionResult &result)
+{
+ const uint visibility = payload.visibility;
+
+ float3 P = ray_origin;
+ float3 dir = ray_direction;
+
+  /* The direction is not normalized by default, but the curve intersection routine expects a
+   * normalized direction. */
+ float len;
+ dir = normalize_len(dir, &len);
+
+ Intersection isect;
+ isect.t = ray_tmax;
+  /* Transform maximum distance into object space. */
+ if (isect.t != FLT_MAX)
+ isect.t *= len;
+
+ MetalKernelContext context(launch_params_metal);
+ if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
+ launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+ result.accept = !result.continue_search;
+
+ if (result.accept) {
+ result.distance = isect.t / len;
+ }
+ }
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+ const uint object [[user_instance_id]],
+ const uint primitive_id [[primitive_id]],
+ const float3 ray_origin [[origin]],
+ const float3 ray_direction [[direction]],
+ const float ray_tmax [[max_distance]])
+{
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+
+ if (segment.type & PRIMITIVE_CURVE_RIBBON) {
+ metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+# if defined(__METALRT_MOTION__)
+ payload.time,
+# else
+ 0.0f,
+# endif
+ ray_tmax, result);
+ }
+
+ return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+ const uint object [[user_instance_id]],
+ const uint primitive_id [[primitive_id]],
+ const float3 ray_origin [[origin]],
+ const float3 ray_direction [[direction]],
+ const float ray_tmax [[max_distance]])
+{
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+
+ if (segment.type & PRIMITIVE_CURVE_RIBBON) {
+ metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+# if defined(__METALRT_MOTION__)
+ payload.time,
+# else
+ 0.0f,
+# endif
+ ray_tmax, result);
+ }
+
+ return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+ const uint object [[user_instance_id]],
+ const uint primitive_id [[primitive_id]],
+ const float3 ray_origin [[origin]],
+ const float3 ray_direction [[direction]],
+ const float ray_tmax [[max_distance]])
+{
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+ metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+# if defined(__METALRT_MOTION__)
+ payload.time,
+# else
+ 0.0f,
+# endif
+ ray_tmax, result);
+
+ return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+ const uint object [[user_instance_id]],
+ const uint primitive_id [[primitive_id]],
+ const float3 ray_origin [[origin]],
+ const float3 ray_direction [[direction]],
+ const float ray_tmax [[max_distance]])
+{
+ uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+
+ metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+# if defined(__METALRT_MOTION__)
+ payload.time,
+# else
+ 0.0f,
+# endif
+ ray_tmax, result);
+
+ return result;
+}
+#endif /* __HAIR__ */
+
+#ifdef __POINTCLOUD__
+ccl_device_inline
+void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+ const uint object,
+ const uint prim,
+ const uint type,
+ const float3 ray_origin,
+ const float3 ray_direction,
+ float time,
+ const float ray_tmax,
+ thread BoundingBoxIntersectionResult &result)
+{
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = payload.visibility;
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return;
+ }
+# endif
+
+ float3 P = ray_origin;
+ float3 dir = ray_direction;
+
+  /* The direction is not normalized by default, but the point intersection routine expects a
+   * normalized direction. */
+ float len;
+ dir = normalize_len(dir, &len);
+
+ Intersection isect;
+ isect.t = ray_tmax;
+ /* Transform maximum distance into object space. */
+ if (isect.t != FLT_MAX)
+ isect.t *= len;
+
+ MetalKernelContext context(launch_params_metal);
+ if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
+ launch_params_metal, payload, object, prim, isect.u);
+ if (result.accept) {
+ result.distance = isect.t / len;
+ payload.u = isect.u;
+ payload.v = isect.v;
+ payload.prim = prim;
+ payload.type = type;
+ }
+ }
+}
+
+ccl_device_inline
+void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal,
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+ const uint object,
+ const uint prim,
+ const uint type,
+ const float3 ray_origin,
+ const float3 ray_direction,
+ float time,
+ const float ray_tmax,
+ thread BoundingBoxIntersectionResult &result)
+{
+ const uint visibility = payload.visibility;
+
+ float3 P = ray_origin;
+ float3 dir = ray_direction;
+
+  /* The direction is not normalized by default, but the point intersection routine expects a
+   * normalized direction. */
+ float len;
+ dir = normalize_len(dir, &len);
+
+ Intersection isect;
+ isect.t = ray_tmax;
+  /* Transform maximum distance into object space. */
+ if (isect.t != FLT_MAX)
+ isect.t *= len;
+
+ MetalKernelContext context(launch_params_metal);
+ if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
+ launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+ result.accept = !result.continue_search;
+
+ if (result.accept) {
+ result.distance = isect.t / len;
+ }
+ }
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+ const uint object [[user_instance_id]],
+ const uint primitive_id [[primitive_id]],
+ const float3 ray_origin [[origin]],
+ const float3 ray_direction [[direction]],
+ const float ray_tmax [[max_distance]])
+{
+ const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ const int type = kernel_tex_fetch(__objects, object).primitive_type;
+
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+
+ metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+# if defined(__METALRT_MOTION__)
+ payload.time,
+# else
+ 0.0f,
+# endif
+ ray_tmax, result);
+
+ return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
+BoundingBoxIntersectionResult
+__intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+ ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+ const uint object [[user_instance_id]],
+ const uint primitive_id [[primitive_id]],
+ const float3 ray_origin [[origin]],
+ const float3 ray_direction [[direction]],
+ const float ray_tmax [[max_distance]])
+{
+ const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+ const int type = kernel_tex_fetch(__objects, object).primitive_type;
+
+ BoundingBoxIntersectionResult result;
+ result.accept = false;
+ result.continue_search = true;
+ result.distance = ray_tmax;
+
+ metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+# if defined(__METALRT_MOTION__)
+ payload.time,
+# else
+ 0.0f,
+# endif
+ ray_tmax, result);
+
+ return result;
+}
+#endif /* __POINTCLOUD__ */
+#endif /* __METALRT__ */
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index fb9e094b535..db4233624b9 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -21,6 +21,7 @@
#include <optix.h>
#define __KERNEL_GPU__
+#define __KERNEL_GPU_RAYTRACING__
#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
#define __KERNEL_OPTIX__
#define CCL_NAMESPACE_BEGIN
@@ -49,16 +50,16 @@ typedef unsigned long long uint64_t;
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
-#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
@@ -77,6 +78,7 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
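
The new `ccl_gpu_thread_mask(n)` builds a ballot mask selecting the first `n` lanes of a warp by shifting an all-ones word right by `warp_size - n`. A host-side check of that identity (valid for 1 <= n <= 32; shifting by the full width would be undefined):

```cpp
#include <cstdint>
#include <cstdio>

static uint32_t thread_mask(uint32_t thread_warp, uint32_t warp_size = 32)
{
  /* Leaves exactly the low thread_warp bits set. */
  return uint32_t(0xFFFFFFFFu >> (warp_size - thread_warp));
}

int main()
{
  printf("%08x\n", thread_mask(1));  /* 00000001 */
  printf("%08x\n", thread_mask(8));  /* 000000ff */
  printf("%08x\n", thread_mask(32)); /* ffffffff */
}
```
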
@@ -86,7 +88,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
@@ -121,7 +122,14 @@ __device__ half __float2half(const float f)
return val;
}
+__device__ float __half2float(const half h)
+{
+ float val;
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+ return val;
+}
+
/* Types */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
index 7d898ed5d91..e9b72369cd5 100644
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -18,18 +18,20 @@
#pragma once
-#include "kernel/kernel_profiling.h"
-#include "kernel/kernel_types.h"
+#include "kernel/types.h"
-#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/state.h"
+
+#include "kernel/util/profiling.h"
CCL_NAMESPACE_BEGIN
/* Not actually used, just a NULL pointer that gets passed everywhere, which we
* hope gets optimized out by the compiler. */
-struct KernelGlobals {
+struct KernelGlobalsGPU {
int unused[1];
};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
/* Launch parameters */
struct KernelParamsOptiX {
@@ -40,7 +42,7 @@ struct KernelParamsOptiX {
/* Global scene data and textures */
KernelData data;
#define KERNEL_TEX(type, name) const type *name;
-#include "kernel/kernel_textures.h"
+#include "kernel/textures.h"
/* Integrator state */
IntegratorStateGPU __integrator_state;
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 7a79e0c4823..8e3d57bff8a 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -21,42 +21,44 @@
#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
-#include "kernel/integrator/integrator_state.h"
-#include "kernel/integrator/integrator_state_flow.h"
-#include "kernel/integrator/integrator_state_util.h"
+#include "kernel/tables.h"
-#include "kernel/integrator/integrator_intersect_closest.h"
-#include "kernel/integrator/integrator_intersect_shadow.h"
-#include "kernel/integrator/integrator_intersect_subsurface.h"
-#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/state.h"
+#include "kernel/integrator/state_flow.h"
+#include "kernel/integrator/state_util.h"
+#include "kernel/integrator/intersect_closest.h"
+#include "kernel/integrator/intersect_shadow.h"
+#include "kernel/integrator/intersect_subsurface.h"
+#include "kernel/integrator/intersect_volume_stack.h"
// clang-format on
+#define OPTIX_DEFINE_ABI_VERSION_ONLY
+#include <optix_function_table.h>
+
template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
{
- return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0());
+ return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
}
template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
{
- return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2());
+ return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
+}
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
+{
+ return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
}
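
OptiX payload registers are 32-bit, so `get_payload_ptr_0/2/6` reassemble a 64-bit device pointer from two of them; `pointer_unpack_from_uint` is the Cycles utility doing the same split in reverse. A minimal sketch of the pack/unpack round trip; the helper names below are illustrative:

```cpp
#include <cassert>
#include <cstdint>

template<typename T>
static T *pointer_unpack(uint32_t lo, uint32_t hi)
{
  /* Reassemble the pointer from its two 32-bit halves. */
  return reinterpret_cast<T *>((uint64_t(hi) << 32) | lo);
}

static void pointer_pack(const void *ptr, uint32_t &lo, uint32_t &hi)
{
  const uint64_t bits = reinterpret_cast<uint64_t>(ptr);
  lo = uint32_t(bits);
  hi = uint32_t(bits >> 32);
}

int main()
{
  int value = 42;
  uint32_t lo, hi;
  pointer_pack(&value, lo, hi);
  assert(pointer_unpack<int>(lo, hi) == &value);
}
```
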
-template<bool always = false> ccl_device_forceinline uint get_object_id()
+ccl_device_forceinline int get_object_id()
{
#ifdef __OBJECT_MOTION__
- /* Always get the the instance ID from the TLAS.
+  /* Always get the instance ID from the TLAS.
* There might be a motion transform node between TLAS and BLAS which does not have one. */
- uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+ return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
#else
- uint object = optixGetInstanceId();
+ return optixGetInstanceId();
#endif
- /* Choose between always returning object ID or only for instances. */
- if (always || (object & 1) == 0)
- /* Can just remove the low bit since instance always contains object ID. */
- return object >> 1;
- else
- /* Set to OBJECT_NONE if this is not an instanced object. */
- return OBJECT_NONE;
}
extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
@@ -64,7 +66,7 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
const int global_index = optixGetLaunchIndex().x;
const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
global_index;
- integrator_intersect_closest(nullptr, path_index);
+ integrator_intersect_closest(nullptr, path_index, __params.render_buffer);
}
extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
@@ -100,20 +102,26 @@ extern "C" __global__ void __miss__kernel_optix_miss()
extern "C" __global__ void __anyhit__kernel_optix_local_hit()
{
-#ifdef __HAIR__
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
if (!optixIsTriangleHit()) {
- /* Ignore curves. */
+ /* Ignore curves and points. */
return optixIgnoreIntersection();
}
#endif
#ifdef __BVH_LOCAL__
- const uint object = get_object_id<true>();
+ const int object = get_object_id();
if (object != optixGetPayload_4() /* local_object */) {
/* Only intersect with matching object. */
return optixIgnoreIntersection();
}
+ const int prim = optixGetPrimitiveIndex();
+ ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+ if (intersection_skip_self_local(ray->self, prim)) {
+ return optixIgnoreIntersection();
+ }
+
const uint max_hits = optixGetPayload_5();
if (max_hits == 0) {
     /* Special case for when no hit information is requested; just report that something was hit. */
@@ -154,19 +162,19 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
Intersection *isect = &local_isect->hits[hit];
isect->t = optixGetRayTmax();
- isect->prim = optixGetPrimitiveIndex();
+ isect->prim = prim;
isect->object = get_object_id();
- isect->type = kernel_tex_fetch(__prim_type, isect->prim);
+ isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type;
const float2 barycentrics = optixGetTriangleBarycentrics();
isect->u = 1.0f - barycentrics.y - barycentrics.x;
isect->v = barycentrics.x;
/* Record geometric normal. */
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
- const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0));
- const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1));
- const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2));
+ const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
+ const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
+ const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
+ const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
/* Continue tracing (without this the trace call would return after the first hit). */
@@ -177,167 +185,239 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
{
#ifdef __SHADOW_RECORD_ALL__
- bool ignore_intersection = false;
-
- const uint prim = optixGetPrimitiveIndex();
+ int prim = optixGetPrimitiveIndex();
+ const uint object = get_object_id();
# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
- if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
- ignore_intersection = true;
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return optixIgnoreIntersection();
}
# endif
+ ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+ if (intersection_skip_self_shadow(ray->self, object, prim)) {
+ return optixIgnoreIntersection();
+ }
+
float u = 0.0f, v = 0.0f;
+ int type = 0;
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
u = 1.0f - barycentrics.y - barycentrics.x;
v = barycentrics.x;
+ type = kernel_tex_fetch(__objects, object).primitive_type;
}
# ifdef __HAIR__
- else {
+ else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
u = __uint_as_float(optixGetAttribute_0());
v = __uint_as_float(optixGetAttribute_1());
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+ type = segment.type;
+ prim = segment.prim;
+
+# if OPTIX_ABI_VERSION < 55
/* Filter out curve endcaps. */
if (u == 0.0f || u == 1.0f) {
- ignore_intersection = true;
+ return optixIgnoreIntersection();
}
+# endif
}
# endif
+ else {
+ type = kernel_tex_fetch(__objects, object).primitive_type;
+ u = 0.0f;
+ v = 0.0f;
+ }
- int num_hits = optixGetPayload_2();
- int record_index = num_hits;
- const int max_hits = optixGetPayload_3();
+# ifndef __TRANSPARENT_SHADOWS__
+  /* No transparent shadow support compiled in; make the hit opaque. */
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+# else
+ const uint max_hits = optixGetPayload_3();
+ const uint num_hits_packed = optixGetPayload_2();
+ const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+ const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
+
+  /* If the hit is opaque, or the maximum number of hits was reached, all light is blocked and
+   * we can stop immediately. */
+ if (num_hits >= max_hits ||
+ !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+ }
+
+ /* Always use baked shadow transparency for curves. */
+ if (type & PRIMITIVE_CURVE) {
+ float throughput = __uint_as_float(optixGetPayload_1());
+ throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+ optixSetPayload_1(__float_as_uint(throughput));
+ optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
- if (!ignore_intersection) {
- optixSetPayload_2(num_hits + 1);
+ if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+ }
+ else {
+ /* Continue tracing. */
+ optixIgnoreIntersection();
+ return;
+ }
}
- Intersection *const isect_array = get_payload_ptr_0<Intersection>();
+ /* Record transparent intersection. */
+ optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
+
+ uint record_index = num_recorded_hits;
+
+ const IntegratorShadowState state = optixGetPayload_0();
-# ifdef __TRANSPARENT_SHADOWS__
- if (num_hits >= max_hits) {
+ const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+ if (record_index >= max_record_hits) {
/* If maximum number of hits reached, find a hit to replace. */
- const int num_recorded_hits = min(max_hits, num_hits);
- float max_recorded_t = isect_array[0].t;
- int max_recorded_hit = 0;
+ float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+ uint max_recorded_hit = 0;
- for (int i = 1; i < num_recorded_hits; i++) {
- if (isect_array[i].t > max_recorded_t) {
- max_recorded_t = isect_array[i].t;
+ for (int i = 1; i < max_record_hits; i++) {
+ const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+ if (isect_t > max_recorded_t) {
+ max_recorded_t = isect_t;
max_recorded_hit = i;
}
}
if (optixGetRayTmax() >= max_recorded_t) {
- /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current
- * hit anymore. */
+      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
+       * current hit. */
return;
}
record_index = max_recorded_hit;
}
-# endif
- if (!ignore_intersection) {
- Intersection *const isect = isect_array + record_index;
- isect->u = u;
- isect->v = v;
- isect->t = optixGetRayTmax();
- isect->prim = prim;
- isect->object = get_object_id();
- isect->type = kernel_tex_fetch(__prim_type, prim);
-
-# ifdef __TRANSPARENT_SHADOWS__
- /* Detect if this surface has a shader with transparent shadows. */
- if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
-# endif
- /* If no transparent shadows, all light is blocked and we can stop immediately. */
- optixSetPayload_5(true);
- return optixTerminateRay();
-# ifdef __TRANSPARENT_SHADOWS__
- }
-# endif
- }
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
/* Continue tracing. */
optixIgnoreIntersection();
-#endif
+# endif /* __TRANSPARENT_SHADOWS__ */
+#endif /* __SHADOW_RECORD_ALL__ */
}
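
Payload 2 above now carries two counters at once: the number of recorded hits and the total number of hits share one 32-bit register as 16-bit halves, freeing a payload slot for the ray pointer. A sketch of the packing scheme; the real helpers are `uint16_pack_to_uint` and `uint16_unpack_from_uint_0/1` in the Cycles utilities, and the versions below are local stand-ins:

```cpp
#include <cassert>
#include <cstdint>

static uint32_t uint16_pack(uint32_t lo, uint32_t hi)
{
  return (lo & 0xFFFFu) | (hi << 16);
}

static uint32_t uint16_unpack_0(uint32_t packed) { return packed & 0xFFFFu; }
static uint32_t uint16_unpack_1(uint32_t packed) { return packed >> 16; }

int main()
{
  const uint32_t packed = uint16_pack(3 /* recorded */, 7 /* total */);
  assert(uint16_unpack_0(packed) == 3);
  assert(uint16_unpack_1(packed) == 7);
}
```
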
extern "C" __global__ void __anyhit__kernel_optix_volume_test()
{
-#ifdef __HAIR__
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
if (!optixIsTriangleHit()) {
     /* Ignore curves and points. */
return optixIgnoreIntersection();
}
#endif
+ const uint object = get_object_id();
#ifdef __VISIBILITY_FLAG__
- const uint prim = optixGetPrimitiveIndex();
const uint visibility = optixGetPayload_4();
- if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
return optixIgnoreIntersection();
}
#endif
- const uint object = get_object_id<true>();
if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
return optixIgnoreIntersection();
}
+
+ const int prim = optixGetPrimitiveIndex();
+ ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+ if (intersection_skip_self(ray->self, object, prim)) {
+ return optixIgnoreIntersection();
+ }
}
extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
{
#ifdef __HAIR__
- if (!optixIsTriangleHit()) {
+# if OPTIX_ABI_VERSION < 55
+ if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
/* Filter out curve endcaps. */
const float u = __uint_as_float(optixGetAttribute_0());
if (u == 0.0f || u == 1.0f) {
return optixIgnoreIntersection();
}
}
+# endif
#endif
-#ifdef __VISIBILITY_FLAG__
- const uint prim = optixGetPrimitiveIndex();
+ const uint object = get_object_id();
const uint visibility = optixGetPayload_4();
- if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
+#ifdef __VISIBILITY_FLAG__
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
return optixIgnoreIntersection();
}
+#endif
+
+ const int prim = optixGetPrimitiveIndex();
+ ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
- /* Shadow ray early termination. */
if (visibility & PATH_RAY_SHADOW_OPAQUE) {
- return optixTerminateRay();
+ if (intersection_skip_self_shadow(ray->self, object, prim)) {
+ return optixIgnoreIntersection();
+ }
+ else {
+ /* Shadow ray early termination. */
+ return optixTerminateRay();
+ }
+ }
+ else {
+ if (intersection_skip_self(ray->self, object, prim)) {
+ return optixIgnoreIntersection();
+ }
}
-#endif
}
extern "C" __global__ void __closesthit__kernel_optix_hit()
{
+ const int object = get_object_id();
+ const int prim = optixGetPrimitiveIndex();
+
optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
- optixSetPayload_3(optixGetPrimitiveIndex());
- optixSetPayload_4(get_object_id());
- /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. */
- optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex()));
+ optixSetPayload_4(object);
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
optixSetPayload_2(__float_as_uint(barycentrics.x));
+ optixSetPayload_3(prim);
+ optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
}
- else {
+ else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
optixSetPayload_2(optixGetAttribute_1());
+ optixSetPayload_3(segment.prim);
+ optixSetPayload_5(segment.type);
+ }
+ else {
+ optixSetPayload_1(0);
+ optixSetPayload_2(0);
+ optixSetPayload_3(prim);
+ optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
}
}
#ifdef __HAIR__
-ccl_device_inline void optix_intersection_curve(const uint prim, const uint type)
+ccl_device_inline void optix_intersection_curve(const int prim, const int type)
{
- const uint object = get_object_id<true>();
+ const int object = get_object_id();
+
+# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return;
+ }
+# endif
float3 P = optixGetObjectRayOrigin();
float3 dir = optixGetObjectRayDirection();
@@ -358,7 +438,8 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
if (isect.t != FLT_MAX)
isect.t *= len;
- if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
+ if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
optixReportIntersection(isect.t / len,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), /* Attribute_0 */
@@ -368,11 +449,53 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
extern "C" __global__ void __intersection__curve_ribbon()
{
- const uint prim = optixGetPrimitiveIndex();
- const uint type = kernel_tex_fetch(__prim_type, prim);
-
- if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex());
+ const int prim = segment.prim;
+ const int type = segment.type;
+ if (type & PRIMITIVE_CURVE_RIBBON) {
optix_intersection_curve(prim, type);
}
}
+
+#endif
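
The `static_assert` above exists because the hit-kind argument of `optixReportIntersection` only passes user values 0-127 through to `optixGetHitKind`; values 128-255 are reserved by OptiX, so the primitive type must fit in 7 bits. A sketch of that constraint, with illustrative flag values rather than the real kernel enums:

```cpp
#include <cassert>
#include <cstdint>

enum : uint32_t {
  PRIMITIVE_CURVE = 1u << 0,
  PRIMITIVE_POINT = 1u << 1,
  PRIMITIVE_MOTION = 1u << 2,
  PRIMITIVE_ALL = (1u << 3) - 1,
};

/* Same guard as in the kernel: user hit kinds must stay below 128. */
static_assert(PRIMITIVE_ALL < 128, "hit kinds >= 128 are reserved by OptiX");

int main()
{
  /* Mask the type to the allowed range before reporting the hit, then strip
   * the motion bit when classifying, as the closest-hit program does. */
  const uint32_t hit_kind = (PRIMITIVE_CURVE | PRIMITIVE_MOTION) & PRIMITIVE_ALL;
  assert((hit_kind & ~PRIMITIVE_MOTION) != PRIMITIVE_POINT);
}
```
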
+
+#ifdef __POINTCLOUD__
+extern "C" __global__ void __intersection__point()
+{
+ const int prim = optixGetPrimitiveIndex();
+ const int object = get_object_id();
+ const int type = kernel_tex_fetch(__objects, object).primitive_type;
+
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return;
+ }
+# endif
+
+ float3 P = optixGetObjectRayOrigin();
+ float3 dir = optixGetObjectRayDirection();
+
+  /* The direction is not normalized by default, but the point intersection routine expects a
+   * normalized direction. */
+ float len;
+ dir = normalize_len(dir, &len);
+
+# ifdef __OBJECT_MOTION__
+ const float time = optixGetRayTime();
+# else
+ const float time = 0.0f;
+# endif
+
+ Intersection isect;
+ isect.t = optixGetRayTmax();
+ /* Transform maximum distance into object space. */
+ if (isect.t != FLT_MAX) {
+ isect.t *= len;
+ }
+
+ if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+ optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
+ }
+}
#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
index bf787e29eaa..071e9deae0b 100644
--- a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -18,7 +18,8 @@
  * much longer to compile. This is only loaded when needed by the scene. */
#include "kernel/device/optix/kernel.cu"
-#include "kernel/integrator/integrator_shade_surface.h"
+
+#include "kernel/integrator/shade_surface.h"
extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
{