diff options
Diffstat (limited to 'intern/cycles/kernel/device')
-rw-r--r-- | intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 10 | ||||
-rw-r--r-- | intern/cycles/kernel/device/gpu/kernel.h | 8 | ||||
-rw-r--r-- | intern/cycles/kernel/device/metal/compat.h | 16 | ||||
-rw-r--r-- | intern/cycles/kernel/device/metal/kernel.metal | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/compat.h | 19 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.cpp | 20 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel_templates.h | 2 |
7 files changed, 38 insertions, 39 deletions
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 0e5f7b4a2fd..0d7c06f4fc6 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -34,7 +34,7 @@ # include "kernel/integrator/megakernel.h" # include "kernel/film/adaptive_sampling.h" -# include "kernel/film/id_passes.h" +# include "kernel/film/cryptomatte_passes.h" # include "kernel/film/read.h" # include "kernel/bake/bake.h" @@ -169,7 +169,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check); return false; #else - return kernel_adaptive_sampling_convergence_check( + return film_adaptive_sampling_convergence_check( kg, render_buffer, x, y, threshold, reset, offset, stride); #endif } @@ -185,7 +185,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCP #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x); #else - kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); + film_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); #endif } @@ -200,7 +200,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y); #else - kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); + film_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); #endif } @@ -215,7 +215,7 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU * #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess); #else - kernel_cryptomatte_post(kg, render_buffer, pixel_index); + film_cryptomatte_post(kg, render_buffer, pixel_index); #endif } diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index e1ab802aa80..d7d2000775f 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -526,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check( nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } @@ -553,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (y < sh) { ccl_gpu_kernel_call( - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); + film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel_postfix @@ -572,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (x < sw) { ccl_gpu_kernel_call( - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); + film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } ccl_gpu_kernel_postfix @@ -589,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); + ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } ccl_gpu_kernel_postfix diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index b20cfca9a9c..130a9ebafae 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -29,11 +29,12 @@ using namespace metal::raytracing; /* Qualifiers */ -#if defined(__KERNEL_METAL_APPLE__) +/* Inline everything for Apple GPUs. This gives ~1.1x speedup and 10% spill + * reduction for integator_shade_surface. However it comes at the cost of + * longer compile times (~4.5 minutes on M1 Max) and is disabled for that + * reason, until there is a user option to manually enable it. */ -/* Inline everything for Apple GPUs. - * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface - * at the cost of longer compile times (~4.5 minutes on M1 Max). */ +#if 0 // defined(__KERNEL_METAL_APPLE__) # define ccl_device __attribute__((always_inline)) # define ccl_device_inline __attribute__((always_inline)) @@ -45,8 +46,11 @@ using namespace metal::raytracing; # define ccl_device # define ccl_device_inline ccl_device # define ccl_device_forceinline ccl_device -# define ccl_device_noinline ccl_device __attribute__((noinline)) - +# if defined(__KERNEL_METAL_APPLE__) +# define ccl_device_noinline ccl_device +# else +# define ccl_device_noinline ccl_device __attribute__((noinline)) +# endif #endif #define ccl_device_noinline_cpu ccl_device diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal index 3de8c069c30..5646c7446db 100644 --- a/intern/cycles/kernel/device/metal/kernel.metal +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -321,7 +321,7 @@ inline TReturnType metalrt_visibility_test( constant KernelParamsMetal &launch_params_metal, ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, const uint object, - const uint prim, + uint prim, const float u) { TReturnType result; diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index 5c49674f247..8ae40b0612e 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -10,6 +10,7 @@ #define CCL_NAMESPACE_END #include <cstdint> +#include <math.h> #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX @@ -174,21 +175,15 @@ using sycl::half; #define fmodf(x, y) sycl::fmod((x), (y)) #define lgammaf(x) sycl::lgamma((x)) -#define __forceinline __attribute__((always_inline)) - -/* Types */ -#include "util/half.h" -#include "util/types.h" - -/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they - * include oneAPI headers, which transitively include math.h headers which will cause redefinitions - * of the math defines because math.h also uses them and having them defined before math.h include - * is actually UB. */ -/* Use fast math functions - get them from sycl::native namespace for native math function - * implementations */ #define cosf(x) sycl::native::cos(((float)(x))) #define sinf(x) sycl::native::sin(((float)(x))) #define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) #define tanf(x) sycl::native::tan(((float)(x))) #define logf(x) sycl::native::log(((float)(x))) #define expf(x) sycl::native::exp(((float)(x))) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 7e90c553c44..097d21b963f 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -645,13 +645,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* Unsupported kernels */ case DEVICE_KERNEL_NUM: case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: - assert(0); - return false; + kernel_assert(0); + break; } - - /* Unknown kernel. */ - assert(0); - return false; }); } catch (sycl::exception const &e) { @@ -669,7 +665,11 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, return success; } -static const int lowest_supported_driver_version_win = 1011660; +/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows + * since Windows driver 101.3268. */ +/* The same min compute-runtime version is currently required across Windows and Linux. + * For Windows driver 101.3268, compute-runtime version is 23570. */ +static const int lowest_supported_driver_version_win = 1013268; static const int lowest_supported_driver_version_neo = 23570; static int parse_driver_build_version(const sycl::device &device) @@ -773,8 +773,7 @@ static std::vector<sycl::device> oneapi_available_devices() int driver_build_version = parse_driver_build_version(device); if ((driver_build_version > 100000 && driver_build_version < lowest_supported_driver_version_win) || - (driver_build_version > 0 && - driver_build_version < lowest_supported_driver_version_neo)) { + driver_build_version < lowest_supported_driver_version_neo) { filter_out = true; } } @@ -818,7 +817,8 @@ char *oneapi_device_capabilities() GET_NUM_ATTR(max_compute_units) GET_NUM_ATTR(max_work_item_dimensions) - sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>(); + sycl::id<3> max_work_item_sizes = + device.get_info<sycl::info::device::max_work_item_sizes<3>>(); WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h index d8964d9b672..0ae925cf748 100644 --- a/intern/cycles/kernel/device/oneapi/kernel_templates.h +++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h @@ -80,7 +80,7 @@ void oneapi_call( (x, ##__VA_ARGS__) /* This template automatically casts entries in the void **args array to the types requested by the kernel func. - Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ + * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ #define oneapi_template(...) \ template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \ void oneapi_call( \ |