git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel')
 intern/cycles/kernel/CMakeLists.txt                      |  42
 intern/cycles/kernel/bvh/util.h                          |   2
 intern/cycles/kernel/device/cpu/globals.h                |   1
 intern/cycles/kernel/device/cuda/compat.h                |   5
 intern/cycles/kernel/device/cuda/config.h                |  19
 intern/cycles/kernel/device/gpu/image.h                  |  12
 intern/cycles/kernel/device/gpu/kernel.h                 | 846
 intern/cycles/kernel/device/gpu/parallel_active_index.h  | 114
 intern/cycles/kernel/device/gpu/parallel_prefix_sum.h    |   8
 intern/cycles/kernel/device/gpu/parallel_sorted_index.h  |  14
 intern/cycles/kernel/device/hip/compat.h                 |   5
 intern/cycles/kernel/device/hip/config.h                 |  19
 intern/cycles/kernel/device/metal/compat.h               | 161
 intern/cycles/kernel/device/metal/context_begin.h        |  79
 intern/cycles/kernel/device/metal/context_end.h          |  23
 intern/cycles/kernel/device/metal/globals.h              |  51
 intern/cycles/kernel/device/metal/kernel.metal           |  25
 intern/cycles/kernel/device/optix/compat.h               |   5
 intern/cycles/kernel/device/optix/kernel.cu              |  10
 intern/cycles/kernel/film/accumulate.h                   |   9
 intern/cycles/kernel/geom/attribute.h                    |  13
 intern/cycles/kernel/geom/curve.h                        |  12
 intern/cycles/kernel/geom/motion_curve.h                 |  12
 intern/cycles/kernel/geom/motion_triangle.h              |  24
 intern/cycles/kernel/geom/motion_triangle_intersect.h    |  28
 intern/cycles/kernel/geom/patch.h                        |   4
 intern/cycles/kernel/geom/primitive.h                    |  33
 intern/cycles/kernel/geom/subd_triangle.h                |  49
 intern/cycles/kernel/geom/triangle.h                     |  76
 intern/cycles/kernel/geom/triangle_intersect.h           |  61
 intern/cycles/kernel/geom/volume.h                       |   2
 intern/cycles/kernel/integrator/init_from_bake.h         |   3
 intern/cycles/kernel/integrator/init_from_camera.h       |   3
 intern/cycles/kernel/integrator/shade_background.h       |   6
 intern/cycles/kernel/integrator/shade_light.h            |   2
 intern/cycles/kernel/integrator/shade_shadow.h           |   4
 intern/cycles/kernel/integrator/shade_surface.h          |  20
 intern/cycles/kernel/integrator/shade_volume.h           |  25
 intern/cycles/kernel/integrator/shadow_state_template.h  |  12
 intern/cycles/kernel/integrator/state_template.h         |  18
 intern/cycles/kernel/integrator/volume_stack.h           |  28
 intern/cycles/kernel/light/light.h                       |  14
 intern/cycles/kernel/light/sample.h                      |  33
 intern/cycles/kernel/osl/CMakeLists.txt                  |   2
 intern/cycles/kernel/sample/lcg.h                        |   6
 intern/cycles/kernel/sample/pattern.h                    |  13
 intern/cycles/kernel/svm/math_util.h                     |  27
 intern/cycles/kernel/svm/svm.h                           |   2
 intern/cycles/kernel/svm/wavelength.h                    |  35
 intern/cycles/kernel/tables.h                            |  76
 intern/cycles/kernel/textures.h                          |   7
 intern/cycles/kernel/types.h                             |  23
 52 files changed, 1261 insertions(+), 862 deletions(-)
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 1a254f5eddc..d759399b04d 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP
device/hip/kernel.cpp
)
+set(SRC_KERNEL_DEVICE_METAL
+ device/metal/kernel.metal
+)
+
set(SRC_KERNEL_DEVICE_OPTIX
device/optix/kernel.cu
device/optix/kernel_shader_raytrace.cu
@@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
device/optix/globals.h
)
+set(SRC_KERNEL_DEVICE_METAL_HEADERS
+ device/metal/compat.h
+ device/metal/context_begin.h
+ device/metal/context_end.h
+ device/metal/globals.h
+)
+
set(SRC_KERNEL_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -262,6 +273,7 @@ set(SRC_KERNEL_UTIL_HEADERS
)
set(SRC_KERNEL_TYPES_HEADERS
+ tables.h
textures.h
types.h
)
@@ -399,12 +411,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
-I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
--use_fast_math
- -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
-
- if(${experimental})
- set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
- set(name ${name}_experimental)
- endif()
+ -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
+ -Wno-deprecated-gpu-targets)
if(WITH_NANOVDB)
set(cuda_flags ${cuda_flags}
@@ -412,6 +420,10 @@ if(WITH_CYCLES_CUDA_BINARIES)
-I "${NANOVDB_INCLUDE_DIR}")
endif()
+ if(WITH_CYCLES_DEBUG)
+ set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+ endif()
+
if(WITH_CYCLES_CUBIN_COMPILER)
string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)
@@ -560,11 +572,6 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
-ffast-math
-o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
- if(${experimental})
- set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
- set(name ${name}_experimental)
- endif()
-
if(WITH_NANOVDB)
set(hip_flags ${hip_flags}
-D WITH_NANOVDB
@@ -572,7 +579,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
endif()
if(WITH_CYCLES_DEBUG)
- set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+ set(hip_flags ${hip_flags} -D WITH_CYCLES_DEBUG)
endif()
add_custom_command(
@@ -613,6 +620,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
-I "${NANOVDB_INCLUDE_DIR}")
endif()
+ if(WITH_CYCLES_DEBUG)
+ set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+ endif()
+
if(WITH_CYCLES_CUBIN_COMPILER)
# Needed to find libnvrtc-builtins.so. Can't do it from inside
# cycles_cubin_cc since the env variable is read before main()
@@ -701,7 +712,7 @@ if(WITH_COMPILER_ASAN)
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=all")
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
- # With OSL, Cycles disables rtti in some modules, wich then breaks at linking
+ # With OSL, Cycles disables rtti in some modules, which then breaks at linking
# when trying to use vptr sanitizer (included into 'undefined' general option).
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=vptr")
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
@@ -729,12 +740,14 @@ cycles_add_library(cycles_kernel "${LIB}"
${SRC_KERNEL_DEVICE_CUDA}
${SRC_KERNEL_DEVICE_HIP}
${SRC_KERNEL_DEVICE_OPTIX}
+ ${SRC_KERNEL_DEVICE_METAL}
${SRC_KERNEL_HEADERS}
${SRC_KERNEL_DEVICE_CPU_HEADERS}
${SRC_KERNEL_DEVICE_GPU_HEADERS}
${SRC_KERNEL_DEVICE_CUDA_HEADERS}
${SRC_KERNEL_DEVICE_HIP_HEADERS}
${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
+ ${SRC_KERNEL_DEVICE_METAL_HEADERS}
)
source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -746,6 +759,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
+source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -778,6 +792,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index 8686f887021..26ba136dd79 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
swapped = false;
for (int j = 0; j < num_hits - 1; ++j) {
if (hits[j].t > hits[j + 1].t) {
- struct Intersection tmp_hit = hits[j];
+ Intersection tmp_hit = hits[j];
float3 tmp_Ng = Ng[j];
hits[j] = hits[j + 1];
Ng[j] = Ng[j + 1];
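
The hunk above only drops a redundant C-style 'struct' keyword; the surrounding loop is a plain bubble sort that swaps the hit records and their geometric normals in lockstep. A minimal host-side sketch of the same pattern, using a hypothetical Hit type rather than the real Intersection:

    #include <utility>

    struct Hit { float t; int prim; };

    // Sort hits front-to-back by ray distance t, keeping the parallel
    // normal array Ng in sync with every swap.
    void sort_hits(Hit *hits, float (*Ng)[3], int num_hits)
    {
      bool swapped = true;
      while (swapped) {
        swapped = false;
        for (int j = 0; j < num_hits - 1; ++j) {
          if (hits[j].t > hits[j + 1].t) {
            std::swap(hits[j], hits[j + 1]);  // C++ needs no 'struct' here
            std::swap(Ng[j], Ng[j + 1]);
            swapped = true;
          }
        }
      }
    }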
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
index dd0327b3f94..746e48b9880 100644
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -18,6 +18,7 @@
#pragma once
+#include "kernel/tables.h"
#include "kernel/types.h"
#include "kernel/util/profiling.h"
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 1ee82e6eb7c..658dec102b1 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -52,8 +52,9 @@ typedef unsigned long long uint64_t;
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
@@ -75,6 +76,7 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -84,7 +86,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
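
The new ccl_gpu_thread_mask macro replaces the open-coded thread_mask in parallel_active_index.h (see that file below): it builds a mask of all warp lanes preceding the calling lane, so popcount(ballot(pred) & mask) gives each lane its rank among the warp's active lanes. A host-side sketch of the arithmetic, assuming a 32-wide warp and emulating the intrinsics with C++20 std::popcount (valid for lanes 1..31; a shift by the full warp width would be undefined in plain C++):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t warp_size = 32;

    // Mask of lanes strictly below 'lane', as the macro computes it.
    uint32_t thread_mask(uint32_t lane)
    {
      return uint32_t(0xFFFFFFFFu >> (warp_size - lane));
    }

    int main()
    {
      const uint32_t ballot = 0b10110110u;  // stand-in for ccl_gpu_ballot(is_active)
      const uint32_t lane = 5;              // this thread's index within its warp
      // Active lanes preceding lane 5 are bits 0..4 of the ballot: lanes 1, 2, 4.
      std::printf("%d\n", std::popcount(ballot & thread_mask(lane)));  // prints 3
      return 0;
    }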
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index 46196dcdb51..003881d7912 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -92,12 +92,29 @@
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
-
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
+#define ccl_gpu_kernel_threads(block_num_threads) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state */
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda { \
+ __VA_ARGS__; \
+ __device__ int operator()(const int state) \
+ { \
+ return (func); \
+ } \
+ } ccl_gpu_kernel_lambda_pass
+
/* sanity checks */
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
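
The ccl_gpu_kernel_lambda macro sidesteps C++ lambdas, which Metal's shading language does not support: the predicate body becomes operator() of a named function object, and each would-be capture becomes a member that the caller assigns by hand. Roughly what one expansion looks like, as a self-contained sketch rather than the generated code:

    // Sketch of ccl_gpu_kernel_lambda(queued_kernel(state) == kernel_index,
    //                                 int kernel_index) after expansion.
    struct KernelLambda {
      int kernel_index;  // hand-rolled capture, declared via the macro's varargs

      int operator()(const int state) const
      {
        return (state % 4) == kernel_index;  // stand-in for the real predicate
      }
    } ccl_gpu_kernel_lambda_pass;

    // Callers fill the capture, then pass the object where a lambda once went:
    //   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
    //   gpu_parallel_active_index_array<blocksize>(num_states, indices, num_indices,
    //                                              ccl_gpu_kernel_lambda_pass);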
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 95a37c693ae..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+ float x,
+ float y)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
template<typename T>
ccl_device_noinline T kernel_tex_image_interp_nanovdb(
- const TextureInfo &info, float x, float y, float z, uint interpolation)
+ ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
/* float4, byte4, ushort4 and half4 */
const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
float3 P,
InterpolationType interp)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
if (info.use_transform_3d) {
P = transform_point(&info.transform_3d, P);
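
For context, the bicubic and tricubic helpers touched here use the CUDA-samples trick that cubic_h1 above belongs to: the four cubic B-spline taps along each axis collapse into two linear taps, so the hardware's bilinear/trilinear filter does most of the work. A 1D host-side sketch of the weight algebra, with an array lookup standing in for the texture unit (texel centers at i + 0.5, clamp-to-edge):

    #include <algorithm>
    #include <cmath>

    // Cubic B-spline weights for fractional offset a in [0,1).
    float w0(float a) { return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); }
    float w1(float a) { return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); }
    float w2(float a) { return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); }
    float w3(float a) { return (1.0f / 6.0f) * (a * a * a); }

    // Linear "texture fetch" standing in for the hardware filter.
    float tex_linear(const float *img, int n, float u)
    {
      const float x = u - 0.5f;
      const int i = std::clamp((int)std::floor(x), 0, n - 2);
      const float f = std::clamp(x - (float)i, 0.0f, 1.0f);
      return img[i] * (1.0f - f) + img[i + 1] * f;
    }

    // Cubic fetch from two linear fetches; h0/h1 mirror cubic_h0/cubic_h1.
    float tex_cubic(const float *img, int n, float u)
    {
      const float x = u - 0.5f;
      const float px = std::floor(x);
      const float a = x - px;
      const float g0 = w0(a) + w1(a), g1 = w2(a) + w3(a);
      const float h0 = px + (w1(a) / g0) - 1.0f + 0.5f;
      const float h1 = px + (w3(a) / g1) + 1.0f + 0.5f;
      return g0 * tex_linear(img, n, h0) + g1 * tex_linear(img, n, h1);
    }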
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 56fcc38b907..22e2a61a06d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,13 @@
#include "kernel/device/gpu/parallel_sorted_index.h"
#include "kernel/device/gpu/work_stealing.h"
+/* Include constant tables before entering Metal's context class scope (context_begin.h) */
+#include "kernel/tables.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_begin.h"
+#endif
+
#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"
@@ -40,6 +47,11 @@
#include "kernel/bake/bake.h"
#include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_end.h"
+#endif
+
#include "kernel/film/read.h"
/* --------------------------------------------------------------------
@@ -47,7 +59,7 @@
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_reset(int num_states)
+ ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
const int state = ccl_gpu_global_id_x();
@@ -58,10 +70,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_camera,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -72,7 +85,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -83,14 +96,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
- integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_bake,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -101,7 +116,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -112,230 +127,264 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
- integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_closest(const int *path_index_array,
- ccl_global float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_closest,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_closest(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_shadow,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_shadow(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_subsurface(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_volume_stack(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_background(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_background,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_background(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_light(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_light,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_light(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_shadow(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_shadow,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_shadow(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_volume(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_volume,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_volume(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_active_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
- });
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
- });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(int num_states,
- int num_states_limit,
- int *indices,
- int *num_indices,
- int *key_counter,
- int *key_prefix_sum,
- int kernel)
-{
- gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states,
- num_states_limit,
- indices,
- num_indices,
- key_counter,
- key_prefix_sum,
- [kernel](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
- INTEGRATOR_STATE(state, path, shader_sort_key) :
- GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
- });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
-{
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+ int num_states,
+ int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
+ int kernel_index)
+{
+ ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+ INTEGRATOR_STATE(state, path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+ int kernel_index);
+ ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
+ const uint state_index = ccl_gpu_global_id_x();
+ gpu_parallel_sorted_index_array(state_index,
+ num_states,
+ num_states_limit,
+ indices,
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ ccl_gpu_kernel_lambda_pass);
+}
+
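
integrator_sorted_paths_array buckets path states by shader sort key so the shading kernels consume coherent batches. The counter / prefix-sum pair is a counting sort: key_counter holds per-key populations, an exclusive prefix sum over it yields each key's starting offset, and every state then claims the next free slot of its key's bucket. A serial sketch of that scheme (a hedged reconstruction, not the literal gpu_parallel_sorted_index_array body):

    #include <atomic>

    // Scatter each state into its key's bucket. bucket_cursor must start as
    // the exclusive prefix sum of per-key counts, so fetch_add hands out
    // consecutive slots and states sharing a key land contiguously.
    void sorted_index_array(int num_states, const int *key_of_state,
                            std::atomic<int> *bucket_cursor, int *indices)
    {
      for (int state = 0; state < num_states; state++) {
        const int key = key_of_state[state];  // inactive states filtered upstream
        const int slot = bucket_cursor[key].fetch_add(1);
        indices[slot] = state;
      }
    }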
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
+{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+ int num_active_paths);
+ ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -343,28 +392,32 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_state_move(NULL, to_state, from_state);
+ ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+ int num_active_paths);
+ ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) &&
- (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -372,15 +425,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_shadow_state_move(NULL, to_state, from_state);
+ ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
+ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+ prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
- counter, prefix_sum, num_values);
+ gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
@@ -388,16 +440,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
- int sx,
- int sy,
- int sw,
- int sh,
- float threshold,
- bool reset,
- int offset,
- int stride,
- uint *num_active_pixels)
+ ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ ccl_global uint *num_active_pixels)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / sw;
@@ -406,37 +459,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
bool converged = true;
if (x < sw && y < sh) {
- converged = kernel_adaptive_sampling_convergence_check(
- nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
}
}
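
The ballot/popcount sequence above (and its twin in the shadow-catcher kernel further down) is a warp-aggregated atomic: all lanes vote, then a single lane adds the warp's whole count with one atomic operation instead of up to 32. A host-side emulation of one warp, with a loop standing in for the single ccl_gpu_ballot instruction:

    #include <atomic>
    #include <bit>
    #include <cstdint>

    std::atomic<uint32_t> num_active_pixels{0};

    void warp_count_active(const bool converged[32])
    {
      // Gather every lane's predicate into one 32-bit ballot mask.
      uint32_t ballot = 0;
      for (int lane = 0; lane < 32; lane++) {
        ballot |= uint32_t(!converged[lane]) << lane;
      }
      // The lane_id == 0 branch of the kernel: one atomic covers the warp.
      num_active_pixels.fetch_add(uint32_t(std::popcount(ballot)));
    }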
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_x(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int y = ccl_gpu_global_id_x();
if (y < sh) {
- kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_y(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int x = ccl_gpu_global_id_x();
if (x < sw) {
- kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
}
}
@@ -445,12 +512,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+ ccl_gpu_kernel_signature(cryptomatte_postprocess,
+ ccl_global float *render_buffer,
+ int num_pixels)
{
const int pixel_index = ccl_gpu_global_id_x();
if (pixel_index < num_pixels) {
- kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
}
}
@@ -458,36 +527,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
* Film.
*/
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
- float *pixels,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int dst_offset,
- int dst_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
- ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
- y * stride * kfilm_convert->pass_stride;
-
- ccl_global float *pixel = pixels +
- (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
- processor(kfilm_convert, buffer, pixel);
-}
-
ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
const int rgba_offset,
const int rgba_stride,
@@ -508,177 +547,95 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
#endif
}
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
- ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
- y * stride * kfilm_convert->pass_stride;
-
- float pixel[4];
- processor(kfilm_convert, buffer, pixel);
-
- film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
- const half4 half_pixel = float4_to_half4_display(
- make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
- kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel);
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- processor(kfilm_convert, buffer, pixel_rgba);
- pixel_rgba[3] = 1.0f;
- });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float value;
- processor(kfilm_convert, buffer, &value);
-
- pixel_rgba[0] = value;
- pixel_rgba[1] = value;
- pixel_rgba[2] = value;
- pixel_rgba[3] = 1.0f;
- });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
- ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
- (const KernelFilmConvert kfilm_convert, \
- float *pixels, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global float *pixels, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_common(&kfilm_convert, \
- pixels, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ ccl_global float *pixel = pixels + \
+ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
+\
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
} \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
- (const KernelFilmConvert kfilm_convert, \
- uchar4 *rgba, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+\
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
- rgba, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
- }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ const half4 half_pixel = float4_to_half4_display( \
+ make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+ kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
+ }
+
+/* 1 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
+
+#undef KERNEL_FILM_CONVERT_VARIANT
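
The rewritten macro folds the former kernel_gpu_film_convert_common and kernel_gpu_film_convert_half_rgba_common_{rgba,rgb,value} helpers into one body: input_channel_count is a literal per variant, so both ifs resolve at compile time, replicating single-channel passes across RGB and forcing alpha to 1 for anything under four channels. The expansion logic in isolation, as a hypothetical helper:

    // Expand a 1-, 3- or 4-channel pass value into display RGBA. With a
    // compile-time channel count the branches fold away, as in the macro.
    template<int input_channel_count> void expand_to_rgba(float pixel[4])
    {
      if (input_channel_count == 1) {
        pixel[1] = pixel[2] = pixel[0];  // value passes: replicate to R, G, B
      }
      if (input_channel_count <= 3) {
        pixel[3] = 1.0f;                 // value/RGB passes: opaque alpha
      }
    }

So film_convert_depth_half_rgba takes the 1-channel path, light_path the 3-channel one, and the 4-channel variants pass through untouched.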
/* --------------------------------------------------------------------
* Shader evaluation.
@@ -687,42 +644,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
/* Displacement */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_displace,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_displace_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
}
}
/* Background */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_background,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_background_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
}
}
/* Curve Shadow Transparency */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(
+ kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
}
}
@@ -731,15 +692,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_preprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int pass_denoised)
+ ccl_gpu_kernel_signature(filter_color_preprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -750,31 +712,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
- float *color_out = buffer + pass_denoised;
+ ccl_global float *color_out = buffer + pass_denoised;
color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int guiding_pass_normal,
- const float *render_buffer,
- int render_offset,
- int render_stride,
- int render_pass_stride,
- int render_pass_sample_count,
- int render_pass_denoising_albedo,
- int render_pass_denoising_normal,
- int full_x,
- int full_y,
- int width,
- int height,
- int num_samples)
+ ccl_gpu_kernel_signature(filter_guiding_preprocess,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ ccl_global const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -785,10 +748,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
- const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+ ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
float pixel_scale;
if (render_pass_sample_count == PASS_UNUSED) {
@@ -802,8 +765,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_albedo != PASS_UNUSED) {
kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
- const float *aledo_in = buffer + render_pass_denoising_albedo;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = aledo_in[0] * pixel_scale;
albedo_out[1] = aledo_in[1] * pixel_scale;
@@ -814,8 +777,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_normal != PASS_UNUSED) {
kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
- const float *normal_in = buffer + render_pass_denoising_normal;
- float *normal_out = guiding_pixel + guiding_pass_normal;
+ ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+ ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
normal_out[0] = normal_in[0] * pixel_scale;
normal_out[1] = normal_in[1] * pixel_scale;
@@ -824,11 +787,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int width,
- int height)
+ ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
{
kernel_assert(guiding_pass_albedo != PASS_UNUSED);
@@ -841,9 +805,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = 0.5f;
albedo_out[1] = 0.5f;
@@ -851,20 +815,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_postprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int num_samples,
- int pass_noisy,
- int pass_denoised,
- int pass_sample_count,
- int num_components,
- bool use_compositing)
+ ccl_gpu_kernel_signature(filter_color_postprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -875,7 +840,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
float pixel_scale;
if (pass_sample_count == PASS_UNUSED) {
@@ -885,7 +850,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
pixel_scale = __float_as_uint(buffer[pass_sample_count]);
}
- float *denoised_pixel = buffer + pass_denoised;
+ ccl_global float *denoised_pixel = buffer + pass_denoised;
denoised_pixel[0] *= pixel_scale;
denoised_pixel[1] *= pixel_scale;
@@ -898,7 +863,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
/* Currently compositing passes are either 3-component (derived by dividing light passes)
* or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
* simplifies logic and avoids extra memory allocation. */
- const float *noisy_pixel = buffer + pass_noisy;
+ ccl_global const float *noisy_pixel = buffer + pass_noisy;
denoised_pixel[3] = noisy_pixel[3];
}
else {
@@ -914,21 +879,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
- uint *num_possible_splits)
+ ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+ int num_states,
+ ccl_global uint *num_possible_splits)
{
const int state = ccl_gpu_global_id_x();
bool can_split = false;
if (state < num_states) {
- can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const auto can_split_mask = ccl_gpu_ballot(can_split);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
}
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index d7416beb783..a5320edcb3c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+ ActiveIndexContext(int _thread_index,
+ int _global_index,
+ int _threadgroup_size,
+ int _simdgroup_size,
+ int _simd_lane_index,
+ int _simd_group_index,
+ int _num_simd_groups,
+ threadgroup int *_simdgroup_offset)
+ : thread_index(_thread_index),
+ global_index(_global_index),
+ blocksize(_threadgroup_size),
+ ccl_gpu_warp_size(_simdgroup_size),
+ thread_warp(_simd_lane_index),
+ warp_index(_simd_group_index),
+ num_warps(_num_simd_groups),
+ warp_offset(_simdgroup_offset)
+ {
+ }
+
+ const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+ num_warps;
+ threadgroup int *warp_offset;
+
+ template<uint blocksizeDummy, typename IsActiveOp>
+ void active_index_array(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ IsActiveOp is_active_op)
+ {
+ const uint state_index = global_index;
+#else
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
- int *indices,
- int *num_indices,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint warp_index = thread_index / ccl_gpu_warp_size;
const uint num_warps = blocksize / ccl_gpu_warp_size;
- /* Test if state corresponding to this thread is active. */
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
- const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
- }
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
}
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
}
-}
+
+#ifdef __KERNEL_METAL__
+}; /* end class ActiveIndexContext */
+
+/* inject the required thread params into a struct, and redirect to its templated member function
+ */
+# define gpu_parallel_active_index_array \
+ ActiveIndexContext(metal_local_id, \
+ metal_global_id, \
+ metal_local_size, \
+ simdgroup_size, \
+ simd_lane_index, \
+ simd_group_index, \
+ num_simd_groups, \
+ simdgroup_offset) \
+ .active_index_array
+#endif
CCL_NAMESPACE_END
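
In outline, the compaction above assigns each active state its output slot in three stages: a warp-local rank from a ballot, per-warp totals turned into offsets through shared memory, and one global atomic per block. A single-warp CUDA sketch of the same idea (hypothetical, assuming blockDim.x == warpSize):

__global__ void compact_one_warp(const int *flags, int num, int *indices, unsigned int *num_indices)
{
  const unsigned int lane = threadIdx.x;
  const bool active = (lane < (unsigned int)num) && flags[lane];

  /* All lanes vote once; each lane counts the active lanes below itself. */
  const unsigned int vote = __ballot_sync(0xFFFFFFFFu, active);
  const unsigned int rank = __popc(vote & ((1u << lane) - 1u));

  /* Lane 0 reserves output space for the whole warp with one atomic. */
  __shared__ unsigned int base;
  if (lane == 0) {
    base = atomicAdd(num_indices, (unsigned int)__popc(vote));
  }
  __syncthreads();

  if (active) {
    indices[base + rank] = (int)lane;
  }
}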
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index 6de3a022569..4bd002c27e4 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
-template<uint blocksize>
-__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+ ccl_global int *counter,
+ ccl_global int *prefix_sum,
+ const int num_values)
{
- if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ if (global_id != 0) {
return;
}
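
The scan itself remains intentionally serial: the launch arranges for a single thread (global_id == 0) to do all the work, which is cheap for the small counter arrays involved. The underlying operation is a plain exclusive prefix sum:

/* Exclusive scan: prefix_sum[i] = counter[0] + ... + counter[i - 1]. */
void prefix_sum_exclusive(const int *counter, int *prefix_sum, const int num_values)
{
  int offset = 0;
  for (int i = 0; i < num_values; i++) {
    prefix_sum[i] = offset;
    offset += counter[i];
  }
}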
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index c06d7be444f..c092e2a21ee 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+ const uint num_states,
const int num_states_limit,
- int *indices,
- int *num_indices,
- int *key_counter,
- int *key_prefix_sum,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
GetKeyOp get_key_op)
{
- const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
const int key = (state_index < num_states) ? get_key_op(state_index) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
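
Given the per-key counters and prefix sums threaded through this signature, the write step is plausibly a counting sort: the prefix sum gives each key's bucket start, and an atomic per-key counter claims a slot within the bucket. A hedged sketch of that final write (hypothetical helper, not the actual Cycles routine):

__device__ void sorted_index_write(
    const unsigned int state_index, int key, int *indices, int *key_counter, const int *key_prefix_sum)
{
  const int INACTIVE_KEY = ~0; /* mirrors GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY */
  if (key == INACTIVE_KEY) {
    return;
  }
  /* Slot = start of this key's bucket + position claimed within it. */
  const int slot = key_prefix_sum[key] + atomicAdd(&key_counter[key], 1);
  indices[slot] = (int)state_index;
}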
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 282c3eca641..fff7a09e884 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,8 +45,9 @@ typedef unsigned long long uint64_t;
#define ccl_device_forceinline __device__ __forceinline__
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
@@ -74,6 +75,7 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -83,7 +85,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot(predicate)
#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
typedef hipTextureObject_t ccl_gpu_tex_object;
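
The new ccl_gpu_thread_mask(thread_warp) macro selects every lane below the calling lane, so ANDing it with a ballot yields the lane's rank among the warp's active lanes. A standalone CUDA equivalent (using the (1 << lane) - 1 form, which the Metal definition also uses and which avoids the shift-by-warpSize edge case for lane 0):

/* Rank of this lane among the warp's active lanes. */
__device__ unsigned int preceding_active(const bool is_active)
{
  const unsigned int lane = threadIdx.x % warpSize;
  const unsigned int below = (1u << lane) - 1u; /* one bit per lower lane */
  return (unsigned int)__popc(__ballot_sync(0xFFFFFFFFu, is_active) & below);
}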
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
index 2fde0d46015..7ec744d8ad2 100644
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
-
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
extern "C" __global__ void __launch_bounds__(block_num_threads, \
GPU_MULTIPRESSOR_MAX_REGISTERS / \
(block_num_threads * thread_num_registers))
+#define ccl_gpu_kernel_threads(block_num_threads) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state */
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda { \
+ __VA_ARGS__; \
+ __device__ int operator()(const int state) \
+ { \
+ return (func); \
+ } \
+ } ccl_gpu_kernel_lambda_pass
+
/* sanity checks */
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
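
For illustration, ccl_gpu_kernel_lambda declares a named function object in place of a real lambda, so the integrator kernels can pass predicates into the parallel primitives without relying on device-lambda support. A hypothetical use and its rough expansion:

/* Usage (hypothetical): match states queued for a particular kernel. */
ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                      int kernel_index);
ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

/* Roughly expands to: */
struct KernelLambda {
  int kernel_index;
  __device__ int operator()(const int state)
  {
    return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index);
  }
} ccl_gpu_kernel_lambda_pass;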
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 77cea30914c..61597a4acfc 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"
/* Qualifiers */
@@ -42,8 +43,9 @@ using namespace metal;
#define ccl_device_forceinline ccl_device
#define ccl_device_noinline ccl_device __attribute__((noinline))
#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_global device
-#define ccl_static_constant static constant constexpr
+#define ccl_inline_constant static constant constexpr
#define ccl_device_constant constant
#define ccl_constant const device
#define ccl_gpu_shared threadgroup
@@ -58,6 +60,122 @@ using namespace metal;
#define kernel_assert(cond)
+#define ccl_gpu_global_id_x() metal_global_id
+#define ccl_gpu_warp_size simdgroup_size
+#define ccl_gpu_thread_idx_x simd_group_index
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
+
+#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
+
+// clang-format off
+
+/* kernel.h adapters */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+/* Convert a comma-separated list into a semicolon-separated list
+ * (so that we can generate a struct based on kernel entry-point parameters). */
+#define FN0()
+#define FN1(p1) p1;
+#define FN2(p1, p2) p1; p2;
+#define FN3(p1, p2, p3) p1; p2; p3;
+#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
+#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
+#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
+#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
+#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
+#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
+#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
+#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
+#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
+#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
+#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
+#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
+#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+
+/* Generate a struct containing the entry-point parameters and a "run"
+ * method which can access them implicitly via this-> */
+#define ccl_gpu_kernel_signature(name, ...) \
+struct kernel_gpu_##name \
+{ \
+ PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
+ void run(thread MetalKernelContext& context, \
+ threadgroup int *simdgroup_offset, \
+ const uint metal_global_id, \
+ const ushort metal_local_id, \
+ const ushort metal_local_size, \
+ uint simdgroup_size, \
+ uint simd_lane_index, \
+ uint simd_group_index, \
+ uint num_simd_groups) ccl_global const; \
+}; \
+kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+ constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
+ constant MetalAncillaries *_metal_ancillaries, \
+ threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+ const uint metal_global_id [[thread_position_in_grid]], \
+ const ushort metal_local_id [[thread_position_in_threadgroup]], \
+ const ushort metal_local_size [[threads_per_threadgroup]], \
+ uint simdgroup_size [[threads_per_simdgroup]], \
+ uint simd_lane_index [[thread_index_in_simdgroup]], \
+ uint simd_group_index [[simdgroup_index_in_threadgroup]], \
+ uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
+ MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
+ params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+} \
+void kernel_gpu_##name::run(thread MetalKernelContext& context, \
+ threadgroup int *simdgroup_offset, \
+ const uint metal_global_id, \
+ const ushort metal_local_id, \
+ const ushort metal_local_size, \
+ uint simdgroup_size, \
+ uint simd_lane_index, \
+ uint simd_group_index, \
+ uint num_simd_groups) ccl_global const
+
+#define ccl_gpu_kernel_call(x) context.x
+
+/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda \
+ { \
+ KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
+ ccl_private MetalKernelContext &context; \
+ __VA_ARGS__; \
+ int operator()(const int state) const { return (func); } \
+ } ccl_gpu_kernel_lambda_pass(context)
+
+// clang-format on
+
+/* volumetric lambda functions - use function objects for lambda-like functionality */
+#define VOLUME_READ_LAMBDA(function_call) \
+ struct FnObjectRead { \
+ KernelGlobals kg; \
+ ccl_private MetalKernelContext *context; \
+ int state; \
+\
+ VolumeStack operator()(const int i) const \
+ { \
+ return context->function_call; \
+ } \
+ } volume_read_lambda_pass{kg, this, state};
+
+#define VOLUME_WRITE_LAMBDA(function_call) \
+ struct FnObjectWrite { \
+ KernelGlobals kg; \
+ ccl_private MetalKernelContext *context; \
+ int state; \
+\
+ void operator()(const int i, VolumeStack entry) const \
+ { \
+ context->function_call; \
+ } \
+ } volume_write_lambda_pass{kg, this, state};
+
/* make_type definitions with Metal style element initializers */
#ifdef make_float2
# undef make_float2
@@ -112,6 +230,7 @@ using namespace metal;
#define sinhf(x) sinh(float(x))
#define coshf(x) cosh(float(x))
#define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))
/* Use native functions with possibly lower precision for performance,
* no issues found so far. */
@@ -124,3 +243,43 @@ using namespace metal;
#define logf(x) trigmode::log(float(x))
#define NULL 0
+
+#define __device__
+
+/* texture bindings and sampler setup */
+
+struct Texture2DParamsMetal {
+ texture2d<float, access::sample> tex;
+};
+struct Texture3DParamsMetal {
+ texture3d<float, access::sample> tex;
+};
+
+struct MetalAncillaries {
+ device Texture2DParamsMetal *textures_2d;
+ device Texture3DParamsMetal *textures_3d;
+};
+
+#include "util/half.h"
+#include "util/types.h"
+
+enum SamplerType {
+ SamplerFilterNearest_AddressRepeat,
+ SamplerFilterNearest_AddressClampEdge,
+ SamplerFilterNearest_AddressClampZero,
+
+ SamplerFilterLinear_AddressRepeat,
+ SamplerFilterLinear_AddressClampEdge,
+ SamplerFilterLinear_AddressClampZero,
+
+ SamplerCount
+};
+
+constant constexpr array<sampler, SamplerCount> metal_samplers = {
+ sampler(address::repeat, filter::nearest),
+ sampler(address::clamp_to_edge, filter::nearest),
+ sampler(address::clamp_to_zero, filter::nearest),
+ sampler(address::repeat, filter::linear),
+ sampler(address::clamp_to_edge, filter::linear),
+ sampler(address::clamp_to_zero, filter::linear),
+};
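
The FN0..FN16 / GET_LAST_ARG machinery above is the standard argument-counting trick: the argument list pushes the FNn names rightward so GET_LAST_ARG lands on the FN matching the parameter count, and that FN then rewrites the comma-separated parameters as member declarations. A scaled-down, compilable illustration of the same trick (hypothetical names, two parameters max):

#define FN1(p1) p1;
#define FN2(p1, p2) p1; p2;
#define GET_3RD(a, b, c, ...) c
#define MEMBERS(...) GET_3RD(__VA_ARGS__, FN2, FN1)(__VA_ARGS__)

struct Params {
  MEMBERS(int num_states, float threshold) /* -> int num_states; float threshold; */
};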
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
new file mode 100644
index 00000000000..8c9e1c54077
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+
+/* Open the Metal kernel context class
+ * Necessary to access resource bindings */
+class MetalKernelContext {
+ public:
+ constant KernelParamsMetal &launch_params_metal;
+ constant MetalAncillaries *metal_ancillaries;
+
+ MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
+ : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
+ {}
+
+ /* texture fetch adapter functions */
+ typedef uint64_t ccl_gpu_tex_object;
+
+ template<typename T>
+ inline __attribute__((__always_inline__))
+ T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+ kernel_assert(0);
+ return 0;
+ }
+ template<typename T>
+ inline __attribute__((__always_inline__))
+ T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+ kernel_assert(0);
+ return 0;
+ }
+
+ // texture2d
+ template<>
+ inline __attribute__((__always_inline__))
+ float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+ }
+ template<>
+ inline __attribute__((__always_inline__))
+ float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+ }
+
+ // texture3d
+ template<>
+ inline __attribute__((__always_inline__))
+ float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+ }
+ template<>
+ inline __attribute__((__always_inline__))
+ float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+ const uint tid(tex);
+ const uint sid(tex >> 32);
+ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+ }
+# include "kernel/device/gpu/image.h"
+
+ // clang-format on
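
As the tid/sid decode above implies, a Metal "texture object" here is a plain 64-bit handle: the texture slot in the low 32 bits and the sampler enum in the high 32 bits. The matching host-side encode would be (hypothetical, assuming exactly this packing):

static inline uint64_t make_tex_handle(const uint32_t texture_slot, const uint32_t sampler_id)
{
  return (uint64_t(sampler_id) << 32) | uint64_t(texture_slot);
}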
diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h
new file mode 100644
index 00000000000..e700f294440
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+}
+; /* end of MetalKernelContext class definition */
+
+/* Silently redirect into the MetalKernelContext instance */
+/* NOTE: These macros will need maintaining as entry-points change. */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state context.launch_params_metal.__integrator_state
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
new file mode 100644
index 00000000000..1aea36589d0
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+#include "kernel/integrator/state.h"
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct KernelParamsMetal {
+
+#define KERNEL_TEX(type, name) ccl_global const type *name;
+#include "kernel/textures.h"
+#undef KERNEL_TEX
+
+ const IntegratorStateGPU __integrator_state;
+ const KernelData data;
+
+} KernelParamsMetal;
+
+typedef struct KernelGlobalsGPU {
+ int unused[1];
+} KernelGlobalsGPU;
+
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data launch_params_metal.data
+#define kernel_integrator_state launch_params_metal.__integrator_state
+
+/* data lookup defines */
+
+#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
+#define kernel_tex_array(tex) launch_params_metal.tex
+
+CCL_NAMESPACE_END
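
KernelParamsMetal is generated with the usual X-macro pattern: kernel/textures.h is just a list of KERNEL_TEX(type, name) entries, and each includer redefines KERNEL_TEX to stamp a different artifact out of the same list. A minimal standalone sketch (hypothetical list header):

/* texture_list.h (hypothetical stand-in for kernel/textures.h):
 *   KERNEL_TEX(float, __tri_verts)
 *   KERNEL_TEX(uint, __prim_index)
 */
struct Params {
#define KERNEL_TEX(type, name) const type *name;
#include "texture_list.h"
#undef KERNEL_TEX
};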
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
new file mode 100644
index 00000000000..feca20ff475
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Metal kernel entry points */
+
+// clang-format off
+
+#include "kernel/device/metal/compat.h"
+#include "kernel/device/metal/globals.h"
+#include "kernel/device/gpu/kernel.h"
+
+// clang-format on
\ No newline at end of file
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 835e4621d47..0619c135c39 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -49,10 +49,11 @@ typedef unsigned long long uint64_t;
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device
+#define ccl_device_inline_method ccl_device
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
@@ -76,6 +77,7 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -85,7 +87,6 @@ typedef unsigned long long uint64_t;
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
-#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index b987aa7a817..4feed59d018 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -21,6 +21,8 @@
#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
+#include "kernel/tables.h"
+
#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"
@@ -44,7 +46,7 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
ccl_device_forceinline int get_object_id()
{
#ifdef __OBJECT_MOTION__
- /* Always get the the instance ID from the TLAS
+ /* Always get the instance ID from the TLAS
* There might be a motion transform node between TLAS and BLAS which does not have one. */
return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
#else
@@ -159,9 +161,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
/* Record geometric normal. */
const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
- const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
- const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
- const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+ const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
+ const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
+ const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
/* Continue tracing (without this the trace call would return after the first hit). */
diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h
index d66d7d6fb70..fb52b1cd05f 100644
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -151,7 +151,8 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
ConstIntegratorState state,
ccl_global float *ccl_restrict render_buffer,
- int sample)
+ int sample,
+ int sample_offset)
{
if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
return sample;
@@ -159,7 +160,9 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
- return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
+ return atomic_fetch_and_add_uint32(
+ (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+ sample_offset;
}
ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
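
The new sample_offset term shifts the returned sample index without touching the in-buffer counter: pass_sample_count still counts from zero, and the offset passed in from the tile is added on top, presumably so a session can continue sampling from a non-zero starting sample. In isolation (hypothetical standalone form):

/* Atomically claim the next per-pixel sample index, shifted by the
 * session's starting sample. */
__device__ int accum_sample(unsigned int *pixel_sample_count, const int sample_offset)
{
  return (int)atomicAdd(pixel_sample_count, 1u) + sample_offset;
}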
@@ -550,7 +553,7 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
const bool is_transparent_background_ray,
ccl_global float *ccl_restrict render_buffer)
{
- float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L;
+ float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L;
kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h
index 848e0430caa..a7ac2bd926f 100644
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
* based while for SVM we use integer ids. */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, -1 if not subdivision triangle */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+ return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}
ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
{
@@ -106,9 +111,9 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals kg,
{
Transform tfm;
- tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
- tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
- tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
+ tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0);
+ tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1);
+ tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2);
return tfm;
}
diff --git a/intern/cycles/kernel/geom/curve.h b/intern/cycles/kernel/geom/curve.h
index 7271193eef8..4b6eecf9640 100644
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -126,8 +126,8 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
- float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
- float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
+ float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+ float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -149,7 +149,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
desc.offset;
- return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+ return kernel_tex_fetch(__attributes_float3, offset);
}
else {
return make_float3(0.0f, 0.0f, 0.0f);
@@ -168,8 +168,8 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
- float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
- float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+ float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0);
+ float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1);
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -191,7 +191,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
desc.offset;
- return kernel_tex_fetch(__attributes_float3, offset);
+ return kernel_tex_fetch(__attributes_float4, offset);
}
else {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/motion_curve.h b/intern/cycles/kernel/geom/motion_curve.h
index 2dd213d43f6..8358c94360f 100644
--- a/intern/cycles/kernel/geom/motion_curve.h
+++ b/intern/cycles/kernel/geom/motion_curve.h
@@ -48,8 +48,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,
offset += step * numkeys;
- keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
- keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
+ keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+ keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
}
}
@@ -106,10 +106,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,
offset += step * numkeys;
- keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
- keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
- keys[2] = kernel_tex_fetch(__attributes_float3, offset + k2);
- keys[3] = kernel_tex_fetch(__attributes_float3, offset + k3);
+ keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+ keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
+ keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2);
+ keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3);
}
}
diff --git a/intern/cycles/kernel/geom/motion_triangle.h b/intern/cycles/kernel/geom/motion_triangle.h
index 43f894938e0..62b7b630c89 100644
--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@@ -43,9 +43,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
{
if (step == numsteps) {
/* center step: regular vertex location */
- verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
- verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
- verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+ verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+ verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+ verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
}
else {
/* center step is not stored in this array */
@@ -54,9 +54,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
offset += step * numverts;
- verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
- verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
- verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+ verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+ verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+ verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
}
}
@@ -70,9 +70,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
{
if (step == numsteps) {
/* center step: regular vertex location */
- normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
- normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
- normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+ normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+ normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+ normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
}
else {
/* center step is not stored in this array */
@@ -81,9 +81,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
offset += step * numverts;
- normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
- normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
- normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+ normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+ normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+ normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
}
}
diff --git a/intern/cycles/kernel/geom/motion_triangle_intersect.h b/intern/cycles/kernel/geom/motion_triangle_intersect.h
index 256e7add21e..72ad237eeeb 100644
--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -163,19 +163,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
motion_triangle_vertices(kg, fobject, prim, time, verts);
/* Ray-triangle intersection, unoptimized. */
float t, u, v;
- if (ray_triangle_intersect(P,
- dir,
- tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- (ssef *)verts,
-#else
- verts[0],
- verts[1],
- verts[2],
-#endif
- &u,
- &v,
- &t)) {
+ if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
#ifdef __VISIBILITY_FLAG__
/* Visibility flag test. We do it here under the assumption
* that most triangles are culled by node flags.
@@ -229,19 +217,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
motion_triangle_vertices(kg, local_object, prim, time, verts);
/* Ray-triangle intersection, unoptimized. */
float t, u, v;
- if (!ray_triangle_intersect(P,
- dir,
- tmax,
-# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- (ssef *)verts,
-# else
- verts[0],
- verts[1],
- verts[2],
-# endif
- &u,
- &v,
- &t)) {
+ if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
return false;
}
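
Both call sites now take the plain scalar path, since the SSE-packed variant of the test was removed. For reference, a scalar ray/triangle test with this signature shape is classic Möller-Trumbore; a hedged sketch (not the actual Cycles routine, which uses a watertight formulation, and assuming Cycles' float3 operators plus the dot/cross/fabsf helpers from its util headers):

ccl_device_inline bool ray_tri_sketch(const float3 P, const float3 D, const float tmax,
                                      const float3 A, const float3 B, const float3 C,
                                      float *u, float *v, float *t)
{
  const float3 e1 = B - A, e2 = C - A;
  const float3 pvec = cross(D, e2);
  const float det = dot(e1, pvec);
  if (fabsf(det) < 1e-8f) return false; /* ray (nearly) parallel to triangle */
  const float inv_det = 1.0f / det;
  const float3 tvec = P - A;
  *u = dot(tvec, pvec) * inv_det;
  if (*u < 0.0f || *u > 1.0f) return false;
  const float3 qvec = cross(tvec, e1);
  *v = dot(D, qvec) * inv_det;
  if (*v < 0.0f || *u + *v > 1.0f) return false;
  *t = dot(e2, qvec) * inv_det;
  return *t > 0.0f && *t < tmax;
}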
diff --git a/intern/cycles/kernel/geom/patch.h b/intern/cycles/kernel/geom/patch.h
index 7d24937a41e..432618aa243 100644
--- a/intern/cycles/kernel/geom/patch.h
+++ b/intern/cycles/kernel/geom/patch.h
@@ -380,7 +380,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg,
*dv = make_float3(0.0f, 0.0f, 0.0f);
for (int i = 0; i < num_control; i++) {
- float3 v = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[i]));
+ float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
val += v * weights[i];
if (du)
@@ -417,7 +417,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg,
*dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for (int i = 0; i < num_control; i++) {
- float4 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
+ float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]);
val += v * weights[i];
if (du)
diff --git a/intern/cycles/kernel/geom/primitive.h b/intern/cycles/kernel/geom/primitive.h
index 7a8921b6d6e..6d7b550d82f 100644
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@@ -284,18 +284,33 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
int numverts, numkeys;
object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
- /* lookup attributes */
- motion_pre = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
- desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE) ? numverts : numkeys;
- motion_post = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
#ifdef __HAIR__
- if (is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
- object_position_transform(kg, sd, &motion_pre);
- object_position_transform(kg, sd, &motion_post);
+ if (is_curve_primitive) {
+ motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+ desc.offset += numkeys;
+ motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+
+ /* Curve */
+ if ((sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+ object_position_transform(kg, sd, &motion_pre);
+ object_position_transform(kg, sd, &motion_post);
+ }
}
+ else
#endif
+ if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+ /* Triangle */
+ if (subd_triangle_patch(kg, sd) == ~0) {
+ motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+ desc.offset += numverts;
+ motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+ }
+ else {
+ motion_pre = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+ desc.offset += numverts;
+ motion_post = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+ }
+ }
}
/* object motion. note that depending on the mesh having motion vectors, this
diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h
index 8a9a3f71231..1b693a915bf 100644
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@
CCL_NAMESPACE_BEGIN
-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
- return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
/* UV coords of triangle within patch */
ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
@@ -443,8 +436,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
if (dy)
*dy = make_float3(0.0f, 0.0f, 0.0f);
- return float4_to_float3(
- kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch)));
+ return kernel_tex_fetch(__attributes_float3,
+ desc.offset + subd_triangle_patch_face(kg, patch));
}
else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
float2 uv[3];
@@ -452,10 +445,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
uint4 v = subd_triangle_patch_indices(kg, patch);
- float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x));
- float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y));
- float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z));
- float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w));
+ float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
+ float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
+ float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
+ float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
if (subd_triangle_patch_num_corners(kg, patch) != 4) {
f1 = (f1 + f0) * 0.5f;
@@ -484,10 +477,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
float3 f0, f1, f2, f3;
- f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
- f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
- f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
- f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
+ f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
+ f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
+ f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
+ f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
if (subd_triangle_patch_num_corners(kg, patch) != 4) {
f1 = (f1 + f0) * 0.5f;
@@ -513,7 +506,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
if (dy)
*dy = make_float3(0.0f, 0.0f, 0.0f);
- return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+ return kernel_tex_fetch(__attributes_float3, desc.offset);
}
else {
if (dx)
@@ -590,7 +583,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
if (dy)
*dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- return kernel_tex_fetch(__attributes_float3,
+ return kernel_tex_fetch(__attributes_float4,
desc.offset + subd_triangle_patch_face(kg, patch));
}
else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
@@ -599,10 +592,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
uint4 v = subd_triangle_patch_indices(kg, patch);
- float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
- float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
- float4 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
- float4 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
+ float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x);
+ float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y);
+ float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z);
+ float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w);
if (subd_triangle_patch_num_corners(kg, patch) != 4) {
f1 = (f1 + f0) * 0.5f;
@@ -642,10 +635,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset)));
}
else {
- f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
- f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
- f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
- f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
+ f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset);
+ f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset);
+ f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset);
+ f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset);
}
if (subd_triangle_patch_num_corners(kg, patch) != 4) {
@@ -672,7 +665,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
if (dy)
*dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- return kernel_tex_fetch(__attributes_float3, desc.offset);
+ return kernel_tex_fetch(__attributes_float4, desc.offset);
}
else {
if (dx)
diff --git a/intern/cycles/kernel/geom/triangle.h b/intern/cycles/kernel/geom/triangle.h
index 233e901c7ca..854022b3369 100644
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@@ -29,9 +29,9 @@ ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderDat
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
- const float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
- const float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
- const float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+ const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+ const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+ const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
/* return normal */
if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
@@ -54,9 +54,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
- float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
- float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
- float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+ float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+ float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+ float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
/* compute point */
float t = 1.0f - u - v;
*P = (u * v0 + v * v1 + t * v2);
@@ -78,9 +78,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
{
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
- P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
- P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
- P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+ P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+ P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+ P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
}
/* Triangle vertex locations and vertex normals */
@@ -91,12 +91,12 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
float3 N[3])
{
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
- P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
- P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
- P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
- N[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
- N[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
- N[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+ P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+ P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+ P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+ N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+ N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+ N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
}
/* Interpolate smooth vertex normal from vertices */
@@ -106,9 +106,9 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
- float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
- float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
- float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+ float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+ float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+ float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);
@@ -120,9 +120,9 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
- float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
- float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
- float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+ float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+ float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+ float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
/* ensure that the normals are in object space */
if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
@@ -145,9 +145,9 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
{
/* fetch triangle vertex coordinates */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
- const float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
- const float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
- const float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+ const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+ const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+ const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
/* compute derivatives of P w.r.t. uv */
*dPdu = (p0 - p2);
@@ -267,15 +267,15 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
- f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
- f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
- f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
+ f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+ f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+ f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
}
else {
const int tri = desc.offset + sd->prim * 3;
- f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
- f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
- f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+ f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
+ f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
+ f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
}
#ifdef __RAY_DIFFERENTIALS__
@@ -298,7 +298,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
desc.offset;
- return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+ return kernel_tex_fetch(__attributes_float3, offset);
}
else {
return make_float3(0.0f, 0.0f, 0.0f);
@@ -318,16 +318,16 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
- f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
- f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
- f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+ f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x);
+ f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y);
+ f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z);
}
else {
const int tri = desc.offset + sd->prim * 3;
if (desc.element == ATTR_ELEMENT_CORNER) {
- f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
- f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
- f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
+ f0 = kernel_tex_fetch(__attributes_float4, tri + 0);
+ f1 = kernel_tex_fetch(__attributes_float4, tri + 1);
+ f2 = kernel_tex_fetch(__attributes_float4, tri + 2);
}
else {
f0 = color_srgb_to_linear_v4(
@@ -359,7 +359,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
desc.offset;
- return kernel_tex_fetch(__attributes_float3, offset);
+ return kernel_tex_fetch(__attributes_float4, offset);
}
else {
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/triangle_intersect.h b/intern/cycles/kernel/geom/triangle_intersect.h
index faff8a85a93..57a6ae7fe72 100644
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -37,27 +37,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
{
const int prim = kernel_tex_fetch(__prim_index, prim_addr);
const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#else
- const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+ const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-#endif
float t, u, v;
- if (ray_triangle_intersect(P,
- dir,
- tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- ssef_verts,
-#else
- float4_to_float3(tri_a),
- float4_to_float3(tri_b),
- float4_to_float3(tri_c),
-#endif
- &u,
- &v,
- &t)) {
+ if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
#ifdef __VISIBILITY_FLAG__
/* Visibility flag test. We do it here under the assumption
* that most triangles are culled by node flags.
@@ -106,27 +90,11 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
const int prim = kernel_tex_fetch(__prim_index, prim_addr);
const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-# else
- const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
- tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
- tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-# endif
+ const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+ tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+ tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
float t, u, v;
- if (!ray_triangle_intersect(P,
- dir,
- tmax,
-# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- ssef_verts,
-# else
- tri_a,
- tri_b,
- tri_c,
-# endif
- &u,
- &v,
- &t)) {
+ if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
return false;
}
@@ -178,11 +146,6 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
isect->t = t;
/* Record geometric normal. */
-# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
- tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
- tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-# endif
local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
return false;
@@ -223,9 +186,9 @@ ccl_device_inline float3 triangle_refine(KernelGlobals kg,
P = P + D * t;
const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
- const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
- tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
- tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+ const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+ tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+ tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
@@ -280,9 +243,9 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,
# ifdef __INTERSECTION_REFINE__
const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
- const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
- tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
- tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+ const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+ tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+ tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
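With the SSE fast path removed, every caller above passes plain float3 vertices, so the shared entry point presumably reduces to a single scalar signature along these lines (a sketch; the exact declaration lives in the math headers, not in this patch):

ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
                                                   const float3 ray_dir,
                                                   const float ray_tmax,
                                                   const float3 tri_a,
                                                   const float3 tri_b,
                                                   const float3 tri_c,
                                                   ccl_private float *isect_u,
                                                   ccl_private float *isect_v,
                                                   ccl_private float *isect_t); /* assumed signature */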
diff --git a/intern/cycles/kernel/geom/volume.h b/intern/cycles/kernel/geom/volume.h
index 4e83ad6acb3..387eb2646da 100644
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -75,7 +75,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
const AttributeDescriptor desc)
{
if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
- return kernel_tex_fetch(__attributes_float3, desc.offset);
+ return kernel_tex_fetch(__attributes_float4, desc.offset);
}
else if (desc.element == ATTR_ELEMENT_VOXEL) {
/* todo: optimize this so we don't have to transform both here and in
diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index dbaf02836e4..f4a2fbea405 100644
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -65,7 +65,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
}
/* Always count the sample, even if the camera sample will reject the ray. */
- const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+ const int sample = kernel_accum_sample(
+ kg, state, render_buffer, scheduled_sample, tile->sample_offset);
/* Setup render buffers. */
const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h
index f0ba77bd9a6..59dd1a9fa75 100644
--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -89,7 +89,8 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
 * This logic allows us both to count the actual number of samples per pixel, and to add samples
 * to this pixel after it has converged and samples were added elsewhere (in which case
 * `scheduled_sample` will differ from the actual number of samples in this pixel). */
- const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+ const int sample = kernel_accum_sample(
+ kg, state, render_buffer, scheduled_sample, tile->sample_offset);
/* Initialize random number seed for path. */
const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
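Both init paths now forward the tile's sample offset into the accumulator; kernel_accum_sample itself is changed in film/accumulate.h. A minimal sketch of the assumed semantics:

/* Assumed behaviour: the returned index continues a global sequence, so a
 * device rendering samples [offset, offset + num_samples) does not restart
 * at zero. Hypothetical expansion: */
const int sample = scheduled_sample + tile->sample_offset;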
diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index 31452de1ca4..a8ebbe908ae 100644
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -20,7 +20,6 @@
#include "kernel/integrator/shader_eval.h"
#include "kernel/light/light.h"
#include "kernel/light/sample.h"
-#include "kernel/sample/mis.h"
CCL_NAMESPACE_BEGIN
@@ -81,8 +80,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
/* multiple importance sampling, get background light pdf for ray
* direction, and compute weight with respect to BSDF pdf */
const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
- const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
-
+ const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
L *= mis_weight;
}
# endif
@@ -169,7 +167,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
/* multiple importance sampling, get regular light pdf,
* and compute weight with respect to BSDF pdf */
const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
- const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
light_eval *= mis_weight;
}
diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h
index 5abe9e98abc..97ca430752c 100644
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -84,7 +84,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
/* multiple importance sampling, get regular light pdf,
* and compute weight with respect to BSDF pdf */
const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
- const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
light_eval *= mis_weight;
}
diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h
index 1de890aae29..a68fcaa7a64 100644
--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -95,8 +95,8 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
shader_setup_from_volume(kg, shadow_sd, &ray);
- const float step_size = volume_stack_step_size(
- kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
+ VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i));
+ const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);
volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size);
}
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 2793dd3e218..c9c586f5ae4 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -27,8 +27,6 @@
#include "kernel/light/light.h"
#include "kernel/light/sample.h"
-#include "kernel/sample/mis.h"
-
CCL_NAMESPACE_BEGIN
ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
@@ -95,8 +93,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
/* Multiple importance sampling, get triangle light pdf,
* and compute weight with respect to BSDF pdf. */
float pdf = triangle_light_pdf(kg, sd, t);
- float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
+ float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
L *= mis_weight;
}
@@ -155,7 +152,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);
if (ls.shader & SHADER_USE_MIS) {
- const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+ const float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, bsdf_pdf);
bsdf_eval_mul(&bsdf_eval, mis_weight);
}
@@ -195,12 +192,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
- const float3 pass_diffuse_weight = (bounce == 0) ?
- bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
- INTEGRATOR_STATE(state, path, pass_diffuse_weight);
- const float3 pass_glossy_weight = (bounce == 0) ?
- bsdf_eval_pass_glossy_weight(&bsdf_eval) :
- INTEGRATOR_STATE(state, path, pass_glossy_weight);
+ const packed_float3 pass_diffuse_weight =
+ (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) :
+ INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+ const packed_float3 pass_glossy_weight = (bounce == 0) ?
+ packed_float3(
+ bsdf_eval_pass_glossy_weight(&bsdf_eval)) :
+ INTEGRATOR_STATE(state, path, pass_glossy_weight);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
}
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index cc47557d580..712c22357b8 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -27,8 +27,6 @@
#include "kernel/light/light.h"
#include "kernel/light/sample.h"
-#include "kernel/sample/mis.h"
-
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
@@ -78,9 +76,8 @@ ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
ccl_private ShaderData *ccl_restrict sd,
ccl_private float3 *ccl_restrict extinction)
{
- shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, [=](const int i) {
- return integrator_state_read_shadow_volume_stack(state, i);
- });
+ VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+ shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);
if (!(sd->flag & SD_EXTINCTION)) {
return false;
@@ -98,9 +95,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
ccl_private VolumeShaderCoefficients *coeff)
{
const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
- shader_eval_volume<false>(kg, state, sd, path_flag, [=](const int i) {
- return integrator_state_read_volume_stack(state, i);
- });
+ VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+ shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass);
if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
return false;
@@ -772,7 +768,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
if (ls->shader & SHADER_USE_MIS) {
- float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+ float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
bsdf_eval_mul(&phase_eval, mis_weight);
}
@@ -805,9 +801,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
- const float3 pass_diffuse_weight = (bounce == 0) ?
- one_float3() :
- INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+ const packed_float3 pass_diffuse_weight = (bounce == 0) ?
+ packed_float3(one_float3()) :
+ INTEGRATOR_STATE(
+ state, path, pass_diffuse_weight);
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
}
@@ -932,8 +929,8 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
VOLUME_SAMPLE_DISTANCE;
/* Step through volume. */
- const float step_size = volume_stack_step_size(
- kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
+ VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+ const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);
/* TODO: expensive to zero closures? */
VolumeIntegrateResult result = {};
diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h
index 667ab88c8c4..625a429d3db 100644
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -40,15 +40,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T
/* enum PathRayFlag */
KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
/* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
/* Throughput for shadow pass. */
KERNEL_STRUCT_MEMBER(shadow_path,
- float3,
+ packed_float3,
unshadowed_throughput,
KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
/* Number of intersections found by ray-tracing. */
KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_END(shadow_path)
@@ -56,8 +56,8 @@ KERNEL_STRUCT_END(shadow_path)
/********************************** Shadow Ray *******************************/
KERNEL_STRUCT_BEGIN(shadow_ray)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h
index 3299f973713..bd18a7498a3 100644
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -59,12 +59,12 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
/* Continuation probability for path termination. */
KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
/* Throughput. */
-KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
/* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
/* Denoising. */
-KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
/* Shader sorting. */
/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
@@ -73,8 +73,8 @@ KERNEL_STRUCT_END(path)
/************************************** Ray ***********************************/
KERNEL_STRUCT_BEGIN(ray)
-KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
@@ -96,10 +96,10 @@ KERNEL_STRUCT_END(isect)
/*************** Subsurface closure state for subsurface kernel ***************/
KERNEL_STRUCT_BEGIN(subsurface)
-KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE)
KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, Ng, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE)
KERNEL_STRUCT_END(subsurface)
/********************************** Volume Stack ******************************/
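The float3 to packed_float3 conversions here are a storage optimization; an illustration of the assumed layouts on GPU targets, where float3 is padded for alignment:

/* sizeof(float3)        == 16   (three floats padded to 16-byte alignment)
 * sizeof(packed_float3) == 12   (three floats, no padding)
 * Each converted member therefore saves 4 bytes per path state entry. */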
diff --git a/intern/cycles/kernel/integrator/volume_stack.h b/intern/cycles/kernel/integrator/volume_stack.h
index cf69826ffff..ea3fa901e2d 100644
--- a/intern/cycles/kernel/integrator/volume_stack.h
+++ b/intern/cycles/kernel/integrator/volume_stack.h
@@ -18,6 +18,14 @@
CCL_NAMESPACE_BEGIN
+/* Volumetric read/write lambda functions - default implementations */
+#ifndef VOLUME_READ_LAMBDA
+# define VOLUME_READ_LAMBDA(function_call) \
+ auto volume_read_lambda_pass = [=](const int i) { return function_call; };
+# define VOLUME_WRITE_LAMBDA(function_call) \
+ auto volume_write_lambda_pass = [=](const int i, VolumeStack entry) { function_call; };
+#endif
+
/* Volume Stack
*
* This is an array of object/shared ID's that the current segment of the path
@@ -88,26 +96,18 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
IntegratorState state,
ccl_private const ShaderData *sd)
{
- volume_stack_enter_exit(
- kg,
- sd,
- [=](const int i) { return integrator_state_read_volume_stack(state, i); },
- [=](const int i, const VolumeStack entry) {
- integrator_state_write_volume_stack(state, i, entry);
- });
+ VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+ VOLUME_WRITE_LAMBDA(integrator_state_write_volume_stack(state, i, entry))
+ volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass);
}
ccl_device void shadow_volume_stack_enter_exit(KernelGlobals kg,
IntegratorShadowState state,
ccl_private const ShaderData *sd)
{
- volume_stack_enter_exit(
- kg,
- sd,
- [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); },
- [=](const int i, const VolumeStack entry) {
- integrator_state_write_shadow_volume_stack(state, i, entry);
- });
+ VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+ VOLUME_WRITE_LAMBDA(integrator_state_write_shadow_volume_stack(state, i, entry))
+ volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass);
}
/* Clean stack after the last bounce.
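By default the macro simply names a capture-by-value lambda, so the read case expands to exactly the code it replaces:

auto volume_read_lambda_pass = [=](const int i) {
  return integrator_state_read_volume_stack(state, i);
};

The #ifndef guard suggests a device backend (presumably Metal) can define address-space-qualified variants before this header is included.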
diff --git a/intern/cycles/kernel/light/light.h b/intern/cycles/kernel/light/light.h
index 97dca936552..e0a9f1c57f5 100644
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -676,19 +676,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);
/* calculate intersection with the planar triangle */
- if (!ray_triangle_intersect(P,
- ls->D,
- FLT_MAX,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
- (ssef *)V,
-#else
- V[0],
- V[1],
- V[2],
-#endif
- &ls->u,
- &ls->v,
- &ls->t)) {
+ if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
ls->pdf = 0.0f;
return;
}
diff --git a/intern/cycles/kernel/light/sample.h b/intern/cycles/kernel/light/sample.h
index 6b643a95250..ff5d43ed8cd 100644
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -22,6 +22,7 @@
#include "kernel/light/light.h"
#include "kernel/sample/mapping.h"
+#include "kernel/sample/mis.h"
CCL_NAMESPACE_BEGIN
@@ -268,4 +269,36 @@ ccl_device_inline void light_sample_to_volume_shadow_ray(
shadow_ray_setup(sd, ls, P, ray);
}
+ccl_device_inline float light_sample_mis_weight_forward(KernelGlobals kg,
+ const float forward_pdf,
+ const float nee_pdf)
+{
+#ifdef WITH_CYCLES_DEBUG
+ if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) {
+ return 1.0f;
+ }
+ else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) {
+ return 0.0f;
+ }
+ else
+#endif
+ return power_heuristic(forward_pdf, nee_pdf);
+}
+
+ccl_device_inline float light_sample_mis_weight_nee(KernelGlobals kg,
+ const float nee_pdf,
+ const float forward_pdf)
+{
+#ifdef WITH_CYCLES_DEBUG
+ if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) {
+ return 0.0f;
+ }
+ else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) {
+ return 1.0f;
+ }
+ else
+#endif
+ return power_heuristic(nee_pdf, forward_pdf);
+}
+
CCL_NAMESPACE_END
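Outside the WITH_CYCLES_DEBUG overrides, both helpers fall through to the power heuristic; a minimal sketch of the beta = 2 form assumed in kernel/sample/mis.h:

ccl_device_inline float power_heuristic(const float a, const float b)
{
  /* MIS weight for a sample drawn from pdf a, competing with strategy b. */
  return (a * a) / (a * a + b * b);
}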
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index f226c95766f..90e16bd70d4 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -55,7 +55,7 @@ if(APPLE)
# Disable allocation warning on macOS prior to 10.14: the OSLRenderServices
  # contains a member which is 64-byte aligned (a cache inside OIIO's
  # unordered_map_concurrent). This is not something the SDK supports, but
  - # since we take care of allocations ourselves is is OK to ignore the
  + # since we take care of allocations ourselves it is OK to ignore the
# diagnostic message.
string(APPEND CMAKE_CXX_FLAGS " -faligned-allocation")
endif()
diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h
index 92cfff639b4..e8c4915813e 100644
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN
/* Linear Congruential Generator */
-ccl_device uint lcg_step_uint(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device uint lcg_step_uint(T rng)
{
/* implicit mod 2^32 */
*rng = (1103515245 * (*rng) + 12345);
return *rng;
}
-ccl_device float lcg_step_float(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device float lcg_step_float(T rng)
{
/* implicit mod 2^32 */
*rng = (1103515245 * (*rng) + 12345);
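The template parameter stands in for differently qualified pointer types. A hypothetical usage sketch, with T deduced from the argument:

uint rng = 0x12345678u;              /* arbitrary example seed */
const uint r = lcg_step_uint(&rng);  /* T deduced as uint * on CPU/CUDA */
const float f = lcg_step_float(&rng);
/* On Metal the same call sites deduce an address-space-qualified pointer
 * type, e.g. ccl_private uint *, without needing a second overload. */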
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index 0c27992c7f6..adc8493badd 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
/* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
 * We can use this to divide the sample sequence into two classes for easier variance
 * estimation. */
-#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
- return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
-#elif defined(__NVCC__)
- return __popc(sample & 0xaaaaaaaa) & 1;
-#else
- /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
- int i = sample & 0xaaaaaaaa;
- i = i - ((i >> 1) & 0x55555555);
- i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
- i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
- return i & 1;
-#endif
+ return popcount(uint(sample) & 0xaaaaaaaa) & 1;
}
else {
/* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
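Worked example of the classification, using a hypothetical sample value: for sample = 6 = 0b110, the mask 0xaaaaaaaa keeps only the odd bit positions, giving 0b010; popcount(0b010) = 1 and 1 & 1 = 1, so this sample falls in the odd class.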
diff --git a/intern/cycles/kernel/svm/math_util.h b/intern/cycles/kernel/svm/math_util.h
index b2e539cdd1f..20817cd0fd3 100644
--- a/intern/cycles/kernel/svm/math_util.h
+++ b/intern/cycles/kernel/svm/math_util.h
@@ -212,33 +212,6 @@ ccl_device float3 svm_math_blackbody_color(float t)
* which is enough to get the same 8 bit/channel color.
*/
- const float blackbody_table_r[6][3] = {
- {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f},
- {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f},
- {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f},
- {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f},
- {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f},
- {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f},
- };
-
- const float blackbody_table_g[6][3] = {
- {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f},
- {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f},
- {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f},
- {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f},
- {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f},
- {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f},
- };
-
- const float blackbody_table_b[6][4] = {
- {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */
- {0.0f, 0.0f, 0.0f, 0.0f},
- {0.0f, 0.0f, 0.0f, 0.0f},
- {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f},
- {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f},
- {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f},
- };
-
if (t >= 12000.0f) {
return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 62ba5bf04e3..ce32e1a520f 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN
template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
ccl_device void svm_eval_nodes(KernelGlobals kg,
ConstIntegratorGenericState state,
- ShaderData *sd,
+ ccl_private ShaderData *sd,
ccl_global float *render_buffer,
uint32_t path_flag)
{
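The explicit qualifier is what lets a single svm_eval_nodes template compile for address-space-aware backends; assumed mappings from the device compat headers (not shown in this hunk):

/* Metal: #define ccl_private thread
 * CPU:   #define ccl_private          (expands to nothing) */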
diff --git a/intern/cycles/kernel/svm/wavelength.h b/intern/cycles/kernel/svm/wavelength.h
index 28fd172abc7..6e25224243f 100644
--- a/intern/cycles/kernel/svm/wavelength.h
+++ b/intern/cycles/kernel/svm/wavelength.h
@@ -42,41 +42,6 @@ ccl_device_noinline void svm_node_wavelength(KernelGlobals kg,
uint wavelength,
uint color_out)
{
- // CIE colour matching functions xBar, yBar, and zBar for
- // wavelengths from 380 through 780 nanometers, every 5
- // nanometers. For a wavelength lambda in this range:
- // cie_colour_match[(lambda - 380) / 5][0] = xBar
- // cie_colour_match[(lambda - 380) / 5][1] = yBar
- // cie_colour_match[(lambda - 380) / 5][2] = zBar
- const float cie_colour_match[81][3] = {
- {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f},
- {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f},
- {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f},
- {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f},
- {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f},
- {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f},
- {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f},
- {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f},
- {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f},
- {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f},
- {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f},
- {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f},
- {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f},
- {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f},
- {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f},
- {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f},
- {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f},
- {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f},
- {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f},
- {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f},
- {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f},
- {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f},
- {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f},
- {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f},
- {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f},
- {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
- {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
-
float lambda_nm = stack_load_float(stack, wavelength);
float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80
int i = float_to_int(ii);
diff --git a/intern/cycles/kernel/tables.h b/intern/cycles/kernel/tables.h
new file mode 100644
index 00000000000..768033d4ffe
--- /dev/null
+++ b/intern/cycles/kernel/tables.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* clang-format off */
+
+ccl_inline_constant float blackbody_table_r[][3] = {
+ {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f},
+ {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f},
+ {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f},
+ {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f},
+ {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f},
+ {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f}
+};
+
+ccl_inline_constant float blackbody_table_g[][3] = {
+ {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f},
+ {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f},
+ {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f},
+ {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f},
+ {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f},
+ {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f}
+};
+
+ccl_inline_constant float blackbody_table_b[][4] = {
+ {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ {0.0f, 0.0f, 0.0f, 0.0f},
+ {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f},
+ {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f},
+ {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f}
+};
+
+ccl_inline_constant float cie_colour_match[][3] = {
+ {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f},
+ {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f},
+ {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f},
+ {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f},
+ {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f},
+ {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f},
+ {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f},
+ {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f},
+ {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f},
+ {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f},
+ {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f},
+ {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f},
+ {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f},
+ {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f},
+ {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f},
+ {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f},
+ {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f},
+ {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f},
+ {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f},
+ {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f},
+ {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f},
+ {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f},
+ {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f},
+ {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f},
+ {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f},
+ {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
+ {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}
+};
+
+/* clang-format on */
\ No newline at end of file
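Per the comment dropped from wavelength.h, cie_colour_match stores xBar, yBar, zBar at 5 nm steps from 380 to 780 nm, so the lookup in svm_node_wavelength reduces to:

float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); /* scaled 0..80 */
int i = float_to_int(ii);                        /* table row */
/* cie_colour_match[i][0..2] = xBar, yBar, zBar near lambda_nm */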
diff --git a/intern/cycles/kernel/textures.h b/intern/cycles/kernel/textures.h
index 464ecb183cb..2e3ae29a19a 100644
--- a/intern/cycles/kernel/textures.h
+++ b/intern/cycles/kernel/textures.h
@@ -40,11 +40,11 @@ KERNEL_TEX(DecomposedTransform, __camera_motion)
/* triangles */
KERNEL_TEX(uint, __tri_shader)
-KERNEL_TEX(float4, __tri_vnormal)
+KERNEL_TEX(packed_float3, __tri_vnormal)
KERNEL_TEX(uint4, __tri_vindex)
KERNEL_TEX(uint, __tri_patch)
KERNEL_TEX(float2, __tri_patch_uv)
-KERNEL_TEX(float4, __tri_verts)
+KERNEL_TEX(packed_float3, __tri_verts)
/* curves */
KERNEL_TEX(KernelCurve, __curves)
@@ -58,7 +58,8 @@ KERNEL_TEX(uint, __patches)
KERNEL_TEX(uint4, __attributes_map)
KERNEL_TEX(float, __attributes_float)
KERNEL_TEX(float2, __attributes_float2)
-KERNEL_TEX(float4, __attributes_float3)
+KERNEL_TEX(packed_float3, __attributes_float3)
+KERNEL_TEX(float4, __attributes_float4)
KERNEL_TEX(uchar4, __attributes_uchar4)
/* lights */
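Taken together with the geometry changes above, attribute storage is now split by element width (identifiers as in this patch):

/* float3 attributes -> KERNEL_TEX(packed_float3, __attributes_float3)
 * float4 attributes -> KERNEL_TEX(float4,        __attributes_float4)
 * so triangle_attribute_float4() and volume_attribute_float4() now
 * fetch from __attributes_float4 instead of padded float4 rows. */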
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 2f6cadf7496..4a730dbfaaa 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -36,13 +36,6 @@
# define __KERNEL_CPU__
#endif
-/* TODO(sergey): This is only to make it possible to include this header
- * from outside of the kernel. but this could be done somewhat cleaner?
- */
-#ifndef ccl_addr_space
-# define ccl_addr_space
-#endif
-
CCL_NAMESPACE_BEGIN
/* Constants */
@@ -489,6 +482,16 @@ enum PanoramaType {
PANORAMA_NUM_TYPES,
};
+/* Direct Light Sampling */
+
+enum DirectLightSamplingType {
+ DIRECT_LIGHT_SAMPLING_MIS = 0,
+ DIRECT_LIGHT_SAMPLING_FORWARD = 1,
+ DIRECT_LIGHT_SAMPLING_NEE = 2,
+
+ DIRECT_LIGHT_SAMPLING_NUM,
+};
+
/* Differential */
typedef struct differential3 {
@@ -1201,8 +1204,11 @@ typedef struct KernelIntegrator {
/* Closure filter. */
int filter_closures;
+ /* MIS debugging */
+ int direct_light_sampling_type;
+
/* padding */
- int pad1, pad2, pad3;
+ int pad1, pad2;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1426,6 +1432,7 @@ typedef struct KernelWorkTile {
uint start_sample;
uint num_samples;
+ uint sample_offset;
int offset;
uint stride;