From a02992f1313811c9905e44dc95a0aee31d707f67 Mon Sep 17 00:00:00 2001 From: Xavier Hallade Date: Wed, 29 Jun 2022 12:58:04 +0200 Subject: Cycles: Add support for rendering on Intel GPUs using oneAPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a new Cycles device with similar functionality to the existing GPU devices. Kernel compilation and runtime interaction happen via oneAPI DPC++ compiler and SYCL API. This implementation is primarily focusing on Intel® Arc™ GPUs and other future Intel GPUs. The first supported drivers are 101.1660 on Windows and 22.10.22597 on Linux. The necessary tools for compilation are: - A SYCL compiler such as oneAPI DPC++ compiler or https://github.com/intel/llvm - Intel® oneAPI Level Zero which is used for low level device queries: https://github.com/oneapi-src/level-zero - To optionally generate prebuilt graphics binaries: Intel® Graphics Compiler All are included in Linux precompiled libraries on svn: https://svn.blender.org/svnroot/bf-blender/trunk/lib The same goes for Windows precompiled binaries but for the graphics compiler, available as "Intel® Graphics Offline Compiler for OpenCL™ Code" from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html, for which the path can be set as OCLOC_INSTALL_DIR. Being based on the open SYCL standard, this implementation could also be extended to run on other compatible non-Intel hardware in the future. 
Reviewed By: sergey, brecht Differential Revision: https://developer.blender.org/D15254 Co-authored-by: Nikita Sirgienko Co-authored-by: Stefan Werner --- intern/cycles/kernel/CMakeLists.txt | 226 ++++++ intern/cycles/kernel/device/gpu/kernel.h | 4 + .../kernel/device/gpu/parallel_active_index.h | 100 ++- intern/cycles/kernel/device/oneapi/compat.h | 206 +++++ intern/cycles/kernel/device/oneapi/context_begin.h | 13 + intern/cycles/kernel/device/oneapi/context_end.h | 7 + intern/cycles/kernel/device/oneapi/device_id.h | 11 + .../kernel/device/oneapi/dll_interface_template.h | 50 ++ intern/cycles/kernel/device/oneapi/globals.h | 47 ++ intern/cycles/kernel/device/oneapi/image.h | 385 +++++++++ intern/cycles/kernel/device/oneapi/kernel.cpp | 884 +++++++++++++++++++++ intern/cycles/kernel/device/oneapi/kernel.h | 57 ++ .../cycles/kernel/device/oneapi/kernel_templates.h | 121 +++ intern/cycles/kernel/types.h | 2 +- 14 files changed, 2105 insertions(+), 8 deletions(-) create mode 100644 intern/cycles/kernel/device/oneapi/compat.h create mode 100644 intern/cycles/kernel/device/oneapi/context_begin.h create mode 100644 intern/cycles/kernel/device/oneapi/context_end.h create mode 100644 intern/cycles/kernel/device/oneapi/device_id.h create mode 100644 intern/cycles/kernel/device/oneapi/dll_interface_template.h create mode 100644 intern/cycles/kernel/device/oneapi/globals.h create mode 100644 intern/cycles/kernel/device/oneapi/image.h create mode 100644 intern/cycles/kernel/device/oneapi/kernel.cpp create mode 100644 intern/cycles/kernel/device/oneapi/kernel.h create mode 100644 intern/cycles/kernel/device/oneapi/kernel_templates.h (limited to 'intern/cycles/kernel') diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a07d7852211..ccd694dfdfd 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -37,6 +37,10 @@ set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel_shader_raytrace.cu ) 
+set(SRC_KERNEL_DEVICE_ONEAPI + device/oneapi/kernel.cpp +) + set(SRC_KERNEL_DEVICE_CPU_HEADERS device/cpu/compat.h device/cpu/image.h @@ -78,6 +82,17 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS device/metal/globals.h ) +set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS + device/oneapi/compat.h + device/oneapi/context_begin.h + device/oneapi/context_end.h + device/oneapi/device_id.h + device/oneapi/globals.h + device/oneapi/image.h + device/oneapi/kernel.h + device/oneapi/kernel_templates.h +) + set(SRC_KERNEL_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -687,6 +702,212 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_optix) endif() +if(WITH_CYCLES_DEVICE_ONEAPI) + if(WIN32) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll) + else() + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so) + endif() + + set(cycles_oneapi_kernel_sources + ${SRC_KERNEL_DEVICE_ONEAPI} + ${SRC_KERNEL_HEADERS} + ${SRC_KERNEL_DEVICE_GPU_HEADERS} + ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} + ${SRC_UTIL_HEADERS} + ) + + # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options + set(sycl_compiler_flags + ${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI} + -fsycl + -fsycl-unnamed-lambda + -fdelayed-template-parsing + -mllvm -inlinedefault-threshold=300 + -mllvm -inlinehint-threshold=400 + -shared + -DWITH_ONEAPI + -ffast-math + -DNDEBUG + -O2 + -o ${cycles_kernel_oneapi_lib} + -I${CMAKE_CURRENT_SOURCE_DIR}/.. 
+ -I${LEVEL_ZERO_INCLUDE_DIR} + ${LEVEL_ZERO_LIBRARY} + ${SYCL_CPP_FLAGS} + ) + + + if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED) + list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED) + endif() + + # Set defaults for spir64 and spir64_gen options + if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64) + set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'") + endif() + if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen) + SET (CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target") + endif() + # enabling zebin (graphics binary format with improved compatibility) on Windows only while support on Linux isn't available yet + if(WIN32) + string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ") + endif() + string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ") + + if (WITH_CYCLES_ONEAPI_BINARIES) + # Iterate over all targest and their options + list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string) + list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string}) + foreach(target ${CYCLES_ONEAPI_SYCL_TARGETS}) + if(DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_${target}) + list (APPEND sycl_compiler_flags -Xsycl-target-backend=${target} "${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}") + endif() + endforeach() + else() + # If AOT is disabled, build for spir64 + list(APPEND sycl_compiler_flags + -fsycl-targets=spir64 + -Xsycl-target-backend=spir64 "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}") + endif() + + if(WITH_NANOVDB) + list(APPEND sycl_compiler_flags + -DWITH_NANOVDB + -I"${NANOVDB_INCLUDE_DIR}") + endif() + + if(WITH_CYCLES_DEBUG) + list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG) + endif() + + get_filename_component(sycl_compiler_root ${SYCL_COMPILER} DIRECTORY) + get_filename_component(sycl_compiler_compiler_name ${SYCL_COMPILER} NAME_WE) + + if(NOT 
OCLOC_INSTALL_DIR) + get_filename_component(OCLOC_INSTALL_DIR "${sycl_compiler_root}/../lib/ocloc" ABSOLUTE) + endif() + if(WITH_CYCLES_ONEAPI_BINARIES AND NOT EXISTS ${OCLOC_INSTALL_DIR}) + message(FATAL_ERROR "WITH_CYCLES_ONEAPI_BINARIES requires ocloc but ${OCLOC_INSTALL_DIR} directory doesn't exist." + " A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.") + endif() + + if(UNIX AND NOT APPLE) + if(NOT WITH_CXX11_ABI) + check_library_exists(sycl + _ZN2cl4sycl7handler22verifyUsedKernelBundleERKSs ${sycl_compiler_root}/../lib SYCL_NO_CXX11_ABI) + if(SYCL_NO_CXX11_ABI) + list(APPEND sycl_compiler_flags -D_GLIBCXX_USE_CXX11_ABI=0) + endif() + endif() + endif() + + if(WIN32) + list(APPEND sycl_compiler_flags + -fms-extensions + -fms-compatibility + -D_WINDLL + -D_MBCS + -DWIN32 + -D_WINDOWS + -D_CRT_NONSTDC_NO_DEPRECATE + -D_CRT_SECURE_NO_DEPRECATE + -DONEAPI_EXPORT) + + if(sycl_compiler_compiler_name MATCHES "dpcpp") + # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables. + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND "${sycl_compiler_root}/../../env/vars.bat" + COMMAND ${SYCL_COMPILER} $<$:-g>$<$:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + else() + # The open source SYCL compiler just goes by clang++ and does not have such a script. + # Set the variables manually. 
+ string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR}) + if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows + get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY) + string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir}) + get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE) + else() + set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}) + endif() + list(APPEND sycl_compiler_flags + -L "${MSVC_TOOLS_DIR}/lib/x64" + -L "${WINDOWS_KIT_DIR}/um/x64" + -L "${WINDOWS_KIT_DIR}/ucrt/x64") + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND ${CMAKE_COMMAND} -E env + "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib + "PATH=${OCLOC_INSTALL_DIR};${sycl_compiler_root}" + ${SYCL_COMPILER} $<$:-g>$<$:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + endif() + else() + list(APPEND sycl_compiler_flags -fPIC) + + # avoid getting __FAST_MATH__ to be defined for the graphics compiler on CentOS 7 until the compile-time issue it triggers gets fixed. + if(WITH_CYCLES_ONEAPI_BINARIES) + list(APPEND sycl_compiler_flags -fhonor-nans) + endif() + + # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and + # libpi_level_zero.so can be placed next to it and get found. + list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN') + + # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables. + if(sycl_compiler_compiler_name MATCHES "dpcpp") + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND bash -c \"source ${sycl_compiler_root}/../../env/vars.sh&&${SYCL_COMPILER} $<$:-g>$<$:-g> ${sycl_compiler_flags}\" + DEPENDS ${cycles_oneapi_kernel_sources}) + else() + # The open source SYCL compiler just goes by clang++ and does not have such a script. + # Set the variables manually. 
+ if(NOT IGC_INSTALL_DIR) + get_filename_component(IGC_INSTALL_DIR "${sycl_compiler_root}/../lib/igc" ABSOLUTE) + endif() + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND ${CMAKE_COMMAND} -E env + "LD_LIBRARY_PATH=${sycl_compiler_root}/../lib:${OCLOC_INSTALL_DIR}/lib:${IGC_INSTALL_DIR}/lib" + "PATH=${OCLOC_INSTALL_DIR}/bin:${sycl_compiler_root}:$ENV{PATH}" # env PATH is for compiler to find ld + ${SYCL_COMPILER} $<$:-g>$<$:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + endif() + endif() + + # install dynamic libraries required at runtime + if(WIN32) + set(SYCL_RUNTIME_DEPENDENCIES + sycl.dll + pi_level_zero.dll + ) + if(NOT WITH_BLENDER) + # For the Cycles standalone put libraries next to the Cycles application. + delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}) + else() + # For Blender put the libraries next to the Blender executable. + # + # Note that the installation path in the delayed_install is relative to the versioned folder, + # which means we need to go one level up. 
+ delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../") + endif() + elseif(UNIX AND NOT APPLE) + file(GLOB SYCL_RUNTIME_DEPENDENCIES + ${sycl_compiler_root}/../lib/libsycl.so + ${sycl_compiler_root}/../lib/libsycl.so.[0-9] + ${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9] + ) + list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so) + delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib) + endif() + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib) + add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib}) +endif() + # OSL module if(WITH_CYCLES_OSL) @@ -752,6 +973,7 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} ${SRC_KERNEL_DEVICE_METAL_HEADERS} + ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} ) source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS}) @@ -764,6 +986,7 @@ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) +source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}) source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS}) source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS}) @@ -782,6 +1005,9 @@ endif() if(WITH_CYCLES_HIP) add_dependencies(cycles_kernel cycles_kernel_hip) endif() +if(WITH_CYCLES_DEVICE_ONEAPI) + add_dependencies(cycles_kernel cycles_kernel_oneapi) +endif() # Install kernel source for runtime compilation diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 
d657571a5fa..b9a44ccad02 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -14,6 +14,8 @@ #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_begin.h" +#elif defined(__KERNEL_ONEAPI__) +# include "kernel/device/oneapi/context_begin.h" #endif #include "kernel/device/gpu/work_stealing.h" @@ -40,6 +42,8 @@ #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_end.h" +#elif defined(__KERNEL_ONEAPI__) +# include "kernel/device/oneapi/context_end.h" #endif #include "kernel/film/read.h" diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index 7d7266d5edf..c1df49c4f49 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif -#ifndef __KERNEL_METAL__ +/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads, + * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices + * and keep device specific code in compat.h */ + +#ifdef __KERNEL_ONEAPI__ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +template +void cpu_serial_active_index_array_impl(const uint num_states, + ccl_global int *ccl_restrict indices, + ccl_global int *ccl_restrict num_indices, + IsActiveOp is_active_op) +{ + int write_index = 0; + for (int state_index = 0; state_index < num_states; state_index++) { + if (is_active_op(state_index)) + indices[write_index++] = state_index; + } + *num_indices = write_index; + return; +} +# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */ + +template +void gpu_parallel_active_index_array_impl(const uint num_states, + ccl_global int *ccl_restrict indices, + ccl_global int *ccl_restrict num_indices, + IsActiveOp is_active_op) +{ + const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>(); + const uint 
blocksize = item_id.get_local_range(0); + + sycl::multi_ptr + ptr = sycl::ext::oneapi::group_local_memory< + int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group()); + int *warp_offset = *ptr; + + /* NOTE(@nsirgien): Here we calculate the same value as below but + * faster for DPC++ : seems CUDA converting "%", "/", "*" based calculations below into + * something faster already but DPC++ doesn't, so it's better to use + * direct request of needed parameters - switching from this computation to computation below + * will cause 2.5x performance slowdown. */ + const uint thread_index = item_id.get_local_id(0); + const uint thread_warp = item_id.get_sub_group().get_local_id(); + + const uint warp_index = item_id.get_sub_group().get_group_id(); + const uint num_warps = item_id.get_sub_group().get_group_range()[0]; + + const uint state_index = item_id.get_global_id(0); + + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#else /* !__KERNEL__ONEAPI__ */ +# ifndef __KERNEL_METAL__ template __device__ -#endif +# endif void gpu_parallel_active_index_array_impl(const uint num_states, ccl_global int *indices, ccl_global int *num_indices, -#ifdef __KERNEL_METAL__ +# ifdef __KERNEL_METAL__ const uint is_active, const uint blocksize, const int thread_index, @@ -37,7 +90,7 @@ __device__ const int num_warps, threadgroup int *warp_offset) { -#else +# else IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -52,18 +105,33 @@ __device__ /* Test if state corresponding to this thread is active. */ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; -#endif - +# endif +#endif /* !__KERNEL_ONEAPI__ */ /* For each thread within a warp compute how many other active states precede it. 
*/ +#ifdef __KERNEL_ONEAPI__ + const uint thread_offset = sycl::exclusive_scan_over_group( + item_id.get_sub_group(), is_active, std::plus<>()); +#else const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & ccl_gpu_thread_mask(thread_warp)); +#endif /* Last thread in warp stores number of active states for each warp. */ +#ifdef __KERNEL_ONEAPI__ + if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) { +#else if (thread_warp == ccl_gpu_warp_size - 1) { +#endif warp_offset[warp_index] = thread_offset + is_active; } +#ifdef __KERNEL_ONEAPI__ + /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important, + * so faster local barriers can be used. */ + ccl_gpu_local_syncthreads(); +#else ccl_gpu_syncthreads(); +#endif /* Last thread in block converts per-warp sizes to offsets, increments global size of * index array and gets offset to write to. */ @@ -80,7 +148,13 @@ __device__ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); } +#ifdef __KERNEL_ONEAPI__ + /* NOTE(@nsirgien): For us here only important local memory writing (warp_offset), + * so faster local barriers can be used. */ + ccl_gpu_local_syncthreads(); +#else ccl_gpu_syncthreads(); +#endif /* Write to index array. 
*/ if (is_active) { @@ -107,7 +181,19 @@ __device__ simd_group_index, \ num_simd_groups, \ simdgroup_offset) - +#elif defined(__KERNEL_ONEAPI__) +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + if (ccl_gpu_global_size_x() == 1) \ + cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \ + else \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op); +# else +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) +# endif #else # define gpu_parallel_active_index_array( \ diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h new file mode 100644 index 00000000000..30b0f088ede --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_ONEAPI__ + +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#include + +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif + +/* This one does not have an abstraction. + * It's used by other devices directly. 
+ */ + +#define __device__ + +/* Qualifier wrappers for different names on different devices */ + +#define ccl_device +#define ccl_global +#define ccl_always_inline __attribute__((always_inline)) +#define ccl_device_inline inline +#define ccl_noinline +#define ccl_inline_constant const constexpr +#define ccl_static_constant const +#define ccl_device_forceinline __attribute__((always_inline)) +#define ccl_device_noinline ccl_device ccl_noinline +#define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_optional_struct_init +#define ccl_private +#define ATTR_FALLTHROUGH __attribute__((fallthrough)) +#define ccl_constant const +#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +#define ccl_align(n) __attribute__((aligned(n))) +#define kernel_assert(cond) +#define ccl_may_alias + +/* clang-format off */ + +/* kernel.h adapters */ +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define KG_ND_ITEMS \ + kg->nd_item_local_id_0 = item.get_local_id(0); \ + kg->nd_item_local_range_0 = item.get_local_range(0); \ + kg->nd_item_group_0 = item.get_group(0); \ + kg->nd_item_group_range_0 = item.get_group_range(0); \ + kg->nd_item_global_id_0 = item.get_global_id(0); \ + kg->nd_item_global_range_0 = item.get_global_range(0); +#else +# define KG_ND_ITEMS +#endif + +#define ccl_gpu_kernel_signature(name, ...) 
\ +void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ + size_t kernel_global_size, \ + size_t kernel_local_size, \ + sycl::handler &cgh, \ + __VA_ARGS__) { \ + (kg); \ + cgh.parallel_for( \ + sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ + [=](sycl::nd_item<1> item) { \ + KG_ND_ITEMS + +#define ccl_gpu_kernel_postfix \ + }); \ + } + +#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x + +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda \ + { \ + KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \ + ccl_private const ONEAPIKernelContext *kg; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) + +/* GPU thread, block, grid size and index */ +#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED +# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) +#else +# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) +# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) +# define ccl_gpu_block_idx_x (kg->nd_item_group_0) +# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define 
ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) +# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) +#endif + + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() +#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) +#ifdef __SYCL_DEVICE_ONLY__ + #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count()) +#else + #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) +#endif + +/* Debug defines */ +#if defined(__SYCL_DEVICE_ONLY__) +# define CONSTANT __attribute__((opencl_constant)) +#else +# define CONSTANT +#endif + +#define sycl_printf(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf_(format) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +/* GPU texture objects */ + +/* clang-format on */ + +/* Types */ +/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc + * because these types have different interfaces from blender version */ + +using uchar = unsigned char; +using sycl::half; + +struct float3 { + float x, y, z; +}; + +ccl_always_inline float3 make_float3(float x, float y, float z) +{ + return {x, y, z}; +} +ccl_always_inline float3 make_float3(float x) +{ + return {x, x, x}; +} + +/* math functions */ +#define fabsf(x) sycl::fabs((x)) +#define copysignf(x, y) sycl::copysign((x), (y)) +#define asinf(x) sycl::asin((x)) +#define acosf(x) sycl::acos((x)) +#define atanf(x) sycl::atan((x)) +#define floorf(x) sycl::floor((x)) +#define ceilf(x) sycl::ceil((x)) +#define sinhf(x) sycl::sinh((x)) +#define coshf(x) sycl::cosh((x)) +#define 
tanhf(x) sycl::tanh((x)) +#define hypotf(x, y) sycl::hypot((x), (y)) +#define atan2f(x, y) sycl::atan2((x), (y)) +#define fmaxf(x, y) sycl::fmax((x), (y)) +#define fminf(x, y) sycl::fmin((x), (y)) +#define fmodf(x, y) sycl::fmod((x), (y)) +#define lgammaf(x) sycl::lgamma((x)) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" + +/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they + * include oneAPI headers, which transitively include math.h headers which will cause redefintions + * of the math defines because math.h also uses them and having them defined before math.h include + * is actually UB. */ +/* Use fast math functions - get them from sycl::native namespace for native math function + * implementations */ +#define cosf(x) sycl::native::cos(((float)(x))) +#define sinf(x) sycl::native::sin(((float)(x))) +#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) +#define tanf(x) sycl::native::tan(((float)(x))) +#define logf(x) sycl::native::log(((float)(x))) +#define expf(x) sycl::native::exp(((float)(x))) diff --git a/intern/cycles/kernel/device/oneapi/context_begin.h b/intern/cycles/kernel/device/oneapi/context_begin.h new file mode 100644 index 00000000000..6d6f8cec4ca --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_begin.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_NANOVDB +# include +# include +#endif + +/* clang-format off */ +struct ONEAPIKernelContext : public KernelGlobalsGPU { + public: +# include "kernel/device/oneapi/image.h" + /* clang-format on */ diff --git a/intern/cycles/kernel/device/oneapi/context_end.h b/intern/cycles/kernel/device/oneapi/context_end.h new file mode 100644 index 00000000000..ddf0d1f1712 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_end.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: Apache-2.0 + * 
Copyright 2021-2022 Intel Corporation */ +} +; /* end of ONEAPIKernelContext class definition */ + +#undef kernel_integrator_state +#define kernel_integrator_state (*(kg->integrator_state)) diff --git a/intern/cycles/kernel/device/oneapi/device_id.h b/intern/cycles/kernel/device/oneapi/device_id.h new file mode 100644 index 00000000000..b4c94ac27a2 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/device_id.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +/* from public source : + * https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/include/pci_ids/iris_pci_ids.h */ +const static std::set intel_arc_alchemist_device_ids = { + 0x4f80, 0x4f81, 0x4f82, 0x4f83, 0x4f84, 0x4f87, 0x4f88, 0x5690, 0x5691, + 0x5692, 0x5693, 0x5694, 0x5695, 0x5696, 0x5697, 0x56a0, 0x56a1, 0x56a2, + 0x56a3, 0x56a4, 0x56a5, 0x56a6, 0x56b0, 0x56b1, 0x56b2, 0x56b3}; diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h new file mode 100644 index 00000000000..2d740b4c64a --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h @@ -0,0 +1,50 @@ +/* device_capabilities() returns a C string that must be free'd with oneapi_free(). 
*/ +DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) +DLL_INTERFACE_CALL(oneapi_free, void, void *) +DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) + +DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) +DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) + +DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index) +DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue) +DLL_INTERFACE_CALL( + oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment) +DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size) +DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr) + +DLL_INTERFACE_CALL( + oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes) +DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_usm_memset, + bool, + SyclQueue *queue, + void *usm_ptr, + unsigned char value, + size_t num_bytes) + +DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue) + +/* Operation with Kernel globals structure - map of global/constant allocation - filled before + * render/kernel execution As we don't know in cycles sizeof this - Cycles will manage just as + * pointer. 
*/ +DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size) +DLL_INTERFACE_CALL(oneapi_set_global_memory, + void, + SyclQueue *queue, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) + +DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size, + size_t, + SyclQueue *queue, + const DeviceKernel kernel, + const size_t kernel_global_size) +DLL_INTERFACE_CALL(oneapi_enqueue_kernel, + bool, + KernelContext *context, + int kernel, + size_t global_size, + void **args) diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h new file mode 100644 index 00000000000..d60f4f135ba --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#include "kernel/integrator/state.h" +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +CCL_NAMESPACE_BEGIN + +/* NOTE(@nsirgien): With SYCL we can't declare __constant__ global variable, which will be + * accessible from device code, like it has been done for Cycles CUDA backend. 
So, the backend will + * allocate this "constant" memory regions and store pointers to them in oneAPI context class */ + +struct IntegratorStateGPU; +struct IntegratorQueueCounter; + +typedef struct KernelGlobalsGPU { + +#define KERNEL_DATA_ARRAY(type, name) const type *__##name = nullptr; +#include "kernel/data_arrays.h" +#undef KERNEL_DATA_ARRAY + IntegratorStateGPU *integrator_state; + const KernelData *__data; +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + size_t nd_item_local_id_0; + size_t nd_item_local_range_0; + size_t nd_item_group_0; + size_t nd_item_group_range_0; + + size_t nd_item_global_id_0; + size_t nd_item_global_range_0; +#endif +} KernelGlobalsGPU; + +typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data (*(__data)) +#define kernel_integrator_state (*(integrator_state)) + +/* data lookup defines */ + +#define kernel_data_fetch(name, index) __##name[index] +#define kernel_data_array(name) __##name + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h new file mode 100644 index 00000000000..892558d40bf --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/image.h @@ -0,0 +1,385 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +CCL_NAMESPACE_BEGIN + +/* For oneAPI implementation we do manual lookup and interpolation. */ +/* TODO: share implementation with ../cpu/image.h. 
*/
+
+/* Typed fetch from the raw texture data pointer. */
+template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, int index)
+{
+  return reinterpret_cast<const T *>(info.data)[index];
+}
+
+/* Wrap an out-of-range texel coordinate for EXTENSION_REPEAT. */
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+  x %= width;
+  if (x < 0)
+    x += width;
+  return x;
+}
+
+/* Clamp a texel coordinate to the valid [0, width-1] range. */
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+  return clamp(x, 0, width - 1);
+}
+
+/* Fetch a single texel and convert it to float4 RGBA, normalizing integer formats
+ * to [0, 1] and expanding single-channel formats to (v, v, v, 1). */
+ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info, int x, int y, int z)
+{
+  const int data_offset = x + info.width * y + info.width * info.height * z;
+  const int texture_type = info.data_type;
+
+  /* Float4 */
+  if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
+    return tex_fetch<float4>(info, data_offset);
+  }
+  /* Byte4 */
+  else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
+    uchar4 r = tex_fetch<uchar4>(info, data_offset);
+    float f = 1.0f / 255.0f;
+    return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+  }
+  /* Ushort4 */
+  else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
+    ushort4 r = tex_fetch<ushort4>(info, data_offset);
+    float f = 1.0f / 65535.f;
+    return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+  }
+  /* Float */
+  else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
+    float f = tex_fetch<float>(info, data_offset);
+    return make_float4(f, f, f, 1.0f);
+  }
+  /* UShort */
+  else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
+    ushort r = tex_fetch<ushort>(info, data_offset);
+    float f = r * (1.0f / 65535.0f);
+    return make_float4(f, f, f, 1.0f);
+  }
+  /* Half */
+  else if (texture_type == IMAGE_DATA_TYPE_HALF) {
+    float f = tex_fetch<half>(info, data_offset);
+    return make_float4(f, f, f, 1.0f);
+  }
+  /* Half4 */
+  else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
+    half4 r = tex_fetch<half4>(info, data_offset);
+    return make_float4(r.x, r.y, r.z, r.w);
+  }
+  /* Byte */
+  else {
+    uchar r = tex_fetch<uchar>(info, data_offset);
+    float f = r * (1.0f / 255.0f);
+    return make_float4(f, f, f, 1.0f);
+  }
+}
+
+/* Read one texel of a 2D image, applying the image extension (wrap) mode. */
+ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
+{
+  const TextureInfo &info =
kernel_data_fetch(texture_info, id); + + /* Wrap */ + if (info.extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info.width); + y = svm_image_texture_wrap_periodic(y, info.height); + } + else { + x = svm_image_texture_wrap_clamp(x, info.width); + y = svm_image_texture_wrap_clamp(y, info.height); + } + + return svm_image_texture_read(info, x, y, 0); +} + +ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + /* Wrap */ + if (info.extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info.width); + y = svm_image_texture_wrap_periodic(y, info.height); + z = svm_image_texture_wrap_periodic(z, info.depth); + } + else { + x = svm_image_texture_wrap_clamp(x, info.width); + y = svm_image_texture_wrap_clamp(y, info.height); + z = svm_image_texture_wrap_clamp(z, info.depth); + } + + return svm_image_texture_read(info, x, y, z); +} + +static float svm_image_texture_frac(float x, int *ix) +{ + int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); + *ix = i; + return x - (float)i; +} + +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ + u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ + u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } \ + (void)0 + +ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float y) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + if (info.extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + if (info.interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. 
*/ + int ix, iy; + svm_image_texture_frac(x * info.width, &ix); + svm_image_texture_frac(y * info.height, &iy); + + return svm_image_texture_read_2d(id, ix, iy); + } + else if (info.interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy); + + float4 r; + r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy); + r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy); + r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1); + r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy); + + float u[4], v[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y]; + r += weight * svm_image_texture_read_2d(id, ix + x - 1, iy + y - 1); + } + } + return r; + } +} + +#ifdef WITH_NANOVDB +template struct NanoVDBInterpolator { + + typedef typename nanovdb::NanoGrid::AccessorType AccessorType; + + static ccl_always_inline float4 read(float r) + { + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(nanovdb::Vec3f r) + { + return make_float4(r[0], r[1], r[2], 1.0f); + } + + static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x, y, z); + return read(nanovdb::SampleFromVoxels(acc)(xyz)); + } + + static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f); + return read(nanovdb::SampleFromVoxels(acc)(xyz)); 
+ } + + static float4 interp_3d_cubic(const AccessorType &acc, float x, float y, float z) + { + int ix, iy, iz; + int nix, niy, niz; + int pix, piy, piz; + int nnix, nniy, nniz; + /* Tricubic b-spline interpolation. */ + const float tx = svm_image_texture_frac(x - 0.5f, &ix); + const float ty = svm_image_texture_frac(y - 0.5f, &iy); + const float tz = svm_image_texture_frac(z - 0.5f, &iz); + pix = ix - 1; + piy = iy - 1; + piz = iz - 1; + nix = ix + 1; + niy = iy + 1; + niz = iz + 1; + nnix = ix + 2; + nniy = iy + 2; + nniz = iz + 2; + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + const int zc[4] = {piz, iz, niz, nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z])))) +# define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +# define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +# undef COL_TERM +# undef ROW_TERM +# undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, int interp) + { + using namespace nanovdb; + + NanoGrid *const grid = (NanoGrid *)info.data; + AccessorType acc = grid->getAccessor(); + + switch ((interp == INTERPOLATION_NONE) ? 
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(acc, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(acc, x, y, z); + default: + return interp_3d_cubic(acc, x, y, z); + } + } +}; +#endif /* WITH_NANOVDB */ + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, int interp) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + if (info.use_transform_3d) { + Transform tfm = info.transform_3d; + P = transform_point(&tfm, P); + } + + float x = P.x; + float y = P.y; + float z = P.z; + + uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp; + +#ifdef WITH_NANOVDB + if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { + return NanoVDBInterpolator::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + return NanoVDBInterpolator::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) { + return NanoVDBInterpolator::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + return NanoVDBInterpolator::interp_3d(info, x, y, z, interpolation); + } +#else + if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +#endif + else { + if (info.extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + x *= info.width; + y *= info.height; + z *= info.depth; + } + + if (interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. 
*/ + int ix, iy, iz; + svm_image_texture_frac(x, &ix); + svm_image_texture_frac(y, &iy); + svm_image_texture_frac(z, &iz); + + return svm_image_texture_read_3d(id, ix, iy, iz); + } + else if (interpolation == INTERPOLATION_LINEAR) { + /* Trilinear interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x - 0.5f, &ix); + float ty = svm_image_texture_frac(y - 0.5f, &iy); + float tz = svm_image_texture_frac(z - 0.5f, &iz); + + float4 r; + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz); + r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz); + r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz); + r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz); + + r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1); + r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1); + r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1); + r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1); + return r; + } + else { + /* Tricubic interpolation. 
*/ + int ix, iy, iz; + float tx = svm_image_texture_frac(x - 0.5f, &ix); + float ty = svm_image_texture_frac(y - 0.5f, &iy); + float tz = svm_image_texture_frac(z - 0.5f, &iz); + + float u[4], v[4], w[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int z = 0; z < 4; z++) { + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y] * w[z]; + r += weight * svm_image_texture_read_3d(id, ix + x - 1, iy + y - 1, iz + z - 1); + } + } + } + return r; + } +} + +#undef SET_CUBIC_SPLINE_WEIGHTS + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp new file mode 100644 index 00000000000..62affe6e58e --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -0,0 +1,884 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +/* clang-format off */ +# include "kernel.h" +# include +# include +# include + +# include +# include +# include + +# include "kernel/device/oneapi/compat.h" +# include "kernel/device/oneapi/device_id.h" +# include "kernel/device/oneapi/globals.h" +# include "kernel/device/oneapi/kernel_templates.h" + +# include "kernel/device/gpu/kernel.h" +/* clang-format on */ + +static OneAPIErrorCallback s_error_cb = nullptr; +static void *s_error_user_ptr = nullptr; + +static std::vector oneapi_available_devices(); + +void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) +{ + s_error_cb = cb; + s_error_user_ptr = user_ptr; +} + +void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) +{ +# ifdef _DEBUG + sycl::queue *queue = reinterpret_cast(queue_); + sycl::info::device_type device_type = + queue->get_device().get_info(); + sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); + (void)usm_type; + 
assert(usm_type == sycl::usm::alloc::device ||
+         /* NOTE(review): `is_cpu` is not a standard sycl::info::device_type enumerator
+          * (the SYCL 2020 name is `cpu`) -- confirm against the DPC++ headers in use. */
+         ((device_type == sycl::info::device_type::host ||
+           device_type == sycl::info::device_type::is_cpu || allow_host) &&
+          usm_type == sycl::usm::alloc::host));
+# endif
+}
+
+/* Create an in-order SYCL queue for the device at `device_index` in the list reported
+ * by oneapi_available_devices(). Returns false (leaving `external_queue` untouched)
+ * when the index is out of range or queue creation throws. */
+bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
+{
+  bool finished_correct = true;
+  try {
+    std::vector<sycl::device> devices = oneapi_available_devices();
+    /* Cast avoids a signed/unsigned comparison; `device_index < 0` is checked first. */
+    if (device_index < 0 || static_cast<size_t>(device_index) >= devices.size()) {
+      return false;
+    }
+    sycl::queue *created_queue = new sycl::queue(devices[device_index],
+                                                 sycl::property::queue::in_order());
+    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
+  }
+  catch (sycl::exception const &e) {
+    finished_correct = false;
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+  }
+  return finished_correct;
+}
+
+void oneapi_free_queue(SyclQueue *queue_)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  delete queue;
+}
+
+/* Host-accessible USM allocation with the requested alignment. */
+void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
+}
+
+/* Device-only USM allocation. */
+void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::malloc_device(memory_size, *queue);
+}
+
+void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  oneapi_check_usm(queue_, usm_ptr, true);
+  sycl::free(usm_ptr, *queue);
+}
+
+/* Synchronous copy between USM allocations; returns false and reports through the
+ * error callback if the copy throws. */
+bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  oneapi_check_usm(queue_, dest, true);
+  oneapi_check_usm(queue_, src, true);
+  try {
+    sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast(queue_); + oneapi_check_usm(queue_, usm_ptr, true); + try { + sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +bool oneapi_queue_synchronize(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast(queue_); + try { + queue->wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and + * also trigger runtime compilation of all existing oneAPI kernels */ +bool oneapi_run_test_kernel(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast(queue_); + size_t N = 8; + sycl::buffer A(N); + sycl::buffer B(N); + + { + sycl::host_accessor A_host_acc(A, sycl::write_only); + for (size_t i = (size_t)0; i < N; i++) + A_host_acc[i] = rand() % 32; + } + + try { + queue->submit([&](sycl::handler &cgh) { + sycl::accessor A_acc(A, cgh, sycl::read_only); + sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); + + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + }); + queue->wait_and_throw(); + + sycl::host_accessor A_host_acc(A, sycl::read_only); + sycl::host_accessor B_host_acc(B, sycl::read_only); + + for (size_t i = (size_t)0; i < N; i++) { + float result = A_host_acc[i] + B_host_acc[i]; + (void)result; + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } + + return true; +} + +bool 
oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) +{ + kernel_global_size = sizeof(KernelGlobalsGPU); + + return true; +} + +void oneapi_set_global_memory(SyclQueue *queue_, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) +{ + assert(queue_); + assert(kernel_globals); + assert(memory_name); + assert(memory_device_pointer); + KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; + oneapi_check_usm(queue_, memory_device_pointer); + oneapi_check_usm(queue_, kernel_globals, true); + + std::string matched_name(memory_name); + +/* This macro will change global ptr of KernelGlobals via name matching. */ +# define KERNEL_DATA_ARRAY(type, name) \ + else if (#name == matched_name) \ + { \ + globals->__##name = (type *)memory_device_pointer; \ + return; \ + } + if (false) { + } + else if ("integrator_state" == matched_name) { + globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; + return; + } + KERNEL_DATA_ARRAY(KernelData, data) +# include "kernel/data_arrays.h" + else + { + std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" + << std::endl; + assert(false); + } +# undef KERNEL_DATA_ARRAY +} + +/* TODO: Move device information to OneapiDevice initialized on creation and use it. */ +/* TODO: Move below function to oneapi/queue.cpp. 
*/ +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, + const DeviceKernel kernel, + const size_t kernel_global_size) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast(queue_); + (void)kernel_global_size; + const static size_t preferred_work_group_size_intersect_shading = 32; + const static size_t preferred_work_group_size_technical = 1024; + + size_t preferred_work_group_size = 0; + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + preferred_work_group_size = preferred_work_group_size_intersect_shading; + break; + + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: + case DEVICE_KERNEL_INTEGRATOR_RESET: + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + preferred_work_group_size = preferred_work_group_size_technical; + break; + + default: + preferred_work_group_size = 512; + } + + const size_t limit_work_group_size = + 
queue->get_device().get_info(); + return std::min(limit_work_group_size, preferred_work_group_size); +} + +bool oneapi_enqueue_kernel(KernelContext *kernel_context, + int kernel, + size_t global_size, + void **args) +{ + bool success = true; + ::DeviceKernel device_kernel = (::DeviceKernel)kernel; + KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals; + sycl::queue *queue = reinterpret_cast(kernel_context->queue); + assert(queue); + if (!queue) { + return false; + } + + size_t local_size = oneapi_kernel_preferred_local_size( + kernel_context->queue, device_kernel, global_size); + assert(global_size % local_size == 0); + + /* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we + * overwrite it outside of oneapi_kernel_preferred_local_size. */ + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) { + local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE; + } + + /* Kernels listed below need a specific number of work groups. */ + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) { + int num_states = *((int *)(args[0])); + /* Round up to the next work-group. */ + size_t groups_count = (num_states + local_size - 1) / local_size; + /* NOTE(@nsirgien): As for now non-uniform workgroups don't work on most oneAPI devices, we + * extend work size to fit uniformity requirements. 
*/ + global_size = groups_count * local_size; + +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + if (queue->get_device().is_host()) { + global_size = 1; + local_size = 1; + } +# endif + } + + /* Let the compiler throw an error if there are any kernels missing in this implementation. */ +# if defined(_WIN32) +# pragma warning(error : 4062) +# elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic error "-Wswitch" +# endif + + try { + queue->submit([&](sycl::handler &cgh) { + switch (device_kernel) { + case DEVICE_KERNEL_INTEGRATOR_RESET: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_intersect_subsurface); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_intersect_volume_stack); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light); + 
break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_shade_surface_raytrace); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume); + break; + } + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_queued_shadow_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_terminated_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_terminated_shadow_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: { + oneapi_call(kg, 
+ cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_shadow_paths_array); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_adaptive_sampling_convergence_check); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_shader_eval_curve_shadow_transparency); + break; + } + case DEVICE_KERNEL_PREFIX_SUM: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum); + break; + } + + /* clang-format off */ + # define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \ + case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \ + oneapi_call(kg, cgh, \ + global_size, \ + local_size, \ + args, \ + oneapi_kernel_film_convert_##variant); \ + break; \ + } + +# define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \ + DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \ + DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba) + + DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH); + 
DEVICE_KERNEL_FILM_CONVERT(mist, MIST); + DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT); + DEVICE_KERNEL_FILM_CONVERT(float, FLOAT); + DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH); + DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3); + DEVICE_KERNEL_FILM_CONVERT(motion, MOTION); + DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE); + DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER); + DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow, + SHADOW_CATCHER_MATTE_WITH_SHADOW); + DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED); + DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4); + +# undef DEVICE_KERNEL_FILM_CONVERT +# undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL + /* clang-format on */ + + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess); + break; + } + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_filter_guiding_set_fake_albedo); + break; + } + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess); + break; + } + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess); + break; + } + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_shadow_states); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: { + oneapi_call(kg, + cgh, + global_size, + local_size, + 
args, + oneapi_kernel_integrator_shadow_catcher_count_possible_splits); + break; + } + /* Unsupported kernels */ + case DEVICE_KERNEL_NUM: + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + assert(0); + return false; + } + + /* Unknown kernel. */ + assert(0); + return false; + }); + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + success = false; + } + } + +# if defined(_WIN32) +# pragma warning(default : 4062) +# elif defined(__GNUC__) +# pragma GCC diagnostic pop +# endif + return success; +} + +static const int lowest_supported_driver_version_win = 1011660; +static const int lowest_supported_driver_version_neo = 20066; + +static int parse_driver_build_version(const sycl::device &device) +{ + const std::string &driver_version = device.get_info(); + int driver_build_version = 0; + + size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); + if (second_dot_position == std::string::npos) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info() << "\"." 
<< std::endl; + } + else { + try { + size_t third_dot_position = driver_version.find('.', second_dot_position + 1); + if (third_dot_position != std::string::npos) { + const std::string &third_number_substr = driver_version.substr( + second_dot_position + 1, third_dot_position - second_dot_position - 1); + const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); + if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) + driver_build_version = std::stoi(third_number_substr) * 10000 + + std::stoi(forth_number_substr); + } + else { + const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); + driver_build_version = std::stoi(third_number_substr); + } + } + catch (std::invalid_argument &e) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info() << "\"." << std::endl; + } + } + + return driver_build_version; +} + +static std::vector oneapi_available_devices() +{ + bool allow_all_devices = false; + if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) + allow_all_devices = true; + + /* Host device is useful only for debugging at the moment + * so we hide this device with default build settings. */ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + bool allow_host = true; +# else + bool allow_host = false; +# endif + + const std::vector &oneapi_platforms = sycl::platform::get_platforms(); + + std::vector available_devices; + for (const sycl::platform &platform : oneapi_platforms) { + /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. + */ + if (platform.get_backend() == sycl::backend::opencl) { + continue; + } + + const std::vector &oneapi_devices = + (allow_all_devices || allow_host) ? 
platform.get_devices(sycl::info::device_type::all) : + platform.get_devices(sycl::info::device_type::gpu); + + for (const sycl::device &device : oneapi_devices) { + if (allow_all_devices) { + /* still filter out host device if build doesn't support it. */ + if (allow_host || !device.is_host()) { + available_devices.push_back(device); + } + } + else { + bool filter_out = false; + + /* For now we support all Intel(R) Arc(TM) devices + * and any future GPU with more than 128 execution units + * official support can be broaden to older and smaller GPUs once ready. */ + if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { + ze_device_handle_t ze_device = sycl::get_native( + device); + ze_device_properties_t props = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + zeDeviceGetProperties(ze_device, &props); + bool is_dg2 = (intel_arc_alchemist_device_ids.find(props.deviceId) != + intel_arc_alchemist_device_ids.end()); + int number_of_eus = props.numEUsPerSubslice * props.numSubslicesPerSlice * + props.numSlices; + if (!is_dg2 || number_of_eus < 128) + filter_out = true; + + /* if not already filtered out, check driver version. 
*/ + if (!filter_out) { + int driver_build_version = parse_driver_build_version(device); + if ((driver_build_version > 100000 && + driver_build_version < lowest_supported_driver_version_win) || + (driver_build_version > 0 && + driver_build_version < lowest_supported_driver_version_neo)) { + filter_out = true; + } + } + } + else if (!allow_host && device.is_host()) { + filter_out = true; + } + else if (!allow_all_devices) { + filter_out = true; + } + + if (!filter_out) { + available_devices.push_back(device); + } + } + } + } + + return available_devices; +} + +char *oneapi_device_capabilities() +{ + std::stringstream capabilities; + + const std::vector &oneapi_devices = oneapi_available_devices(); + for (const sycl::device &device : oneapi_devices) { + const std::string &name = device.get_info(); + + capabilities << std::string("\t") << name << "\n"; +# define WRITE_ATTR(attribute_name, attribute_variable) \ + capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ + << "\n"; +# define GET_NUM_ATTR(attribute) \ + { \ + size_t attribute = (size_t)device.get_info(); \ + capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ + } + + GET_NUM_ATTR(vendor_id) + GET_NUM_ATTR(max_compute_units) + GET_NUM_ATTR(max_work_item_dimensions) + + sycl::id<3> max_work_item_sizes = device.get_info(); + WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) + WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) + WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) + + GET_NUM_ATTR(max_work_group_size) + GET_NUM_ATTR(max_num_sub_groups) + GET_NUM_ATTR(sub_group_independent_forward_progress) + + GET_NUM_ATTR(preferred_vector_width_char) + GET_NUM_ATTR(preferred_vector_width_short) + GET_NUM_ATTR(preferred_vector_width_int) + GET_NUM_ATTR(preferred_vector_width_long) + GET_NUM_ATTR(preferred_vector_width_float) + 
GET_NUM_ATTR(preferred_vector_width_double) + GET_NUM_ATTR(preferred_vector_width_half) + + GET_NUM_ATTR(native_vector_width_char) + GET_NUM_ATTR(native_vector_width_short) + GET_NUM_ATTR(native_vector_width_int) + GET_NUM_ATTR(native_vector_width_long) + GET_NUM_ATTR(native_vector_width_float) + GET_NUM_ATTR(native_vector_width_double) + GET_NUM_ATTR(native_vector_width_half) + + size_t max_clock_frequency = + (size_t)(device.is_host() ? (size_t)0 : + device.get_info()); + WRITE_ATTR("max_clock_frequency", max_clock_frequency) + + GET_NUM_ATTR(address_bits) + GET_NUM_ATTR(max_mem_alloc_size) + + /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't + * supported so we always return false, even if device supports HW texture usage acceleration. + */ + bool image_support = false; + WRITE_ATTR("image_support", (size_t)image_support) + + GET_NUM_ATTR(max_parameter_size) + GET_NUM_ATTR(mem_base_addr_align) + GET_NUM_ATTR(global_mem_size) + GET_NUM_ATTR(local_mem_size) + GET_NUM_ATTR(error_correction_support) + GET_NUM_ATTR(profiling_timer_resolution) + GET_NUM_ATTR(is_available) + +# undef GET_NUM_ATTR +# undef WRITE_ATTR + capabilities << "\n"; + } + + return ::strdup(capabilities.str().c_str()); +} + +void oneapi_free(void *p) +{ + if (p) { + ::free(p); + } +} + +void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) +{ + int num = 0; + std::vector devices = oneapi_available_devices(); + for (sycl::device &device : devices) { + const std::string &platform_name = + device.get_platform().get_info(); + std::string name = device.get_info(); + std::string id = "ONEAPI_" + platform_name + "_" + name; + (cb)(id.c_str(), name.c_str(), num, user_ptr); + num++; + } +} + +size_t oneapi_get_memcapacity(SyclQueue *queue) +{ + return reinterpret_cast(queue) + ->get_device() + .get_info(); +} + +size_t oneapi_get_compute_units_amount(SyclQueue *queue) +{ + return reinterpret_cast(queue) + ->get_device() + .get_info(); +} + 
+#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h new file mode 100644 index 00000000000..c5f853742ed --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#ifdef WITH_ONEAPI + +# include + +/* NOTE(@nsirgien): Should match underlying type in the declaration inside "kernel/types.h" + * TODO: use kernel/types.h directly. */ +enum DeviceKernel : int; + +# ifndef CYCLES_KERNEL_ONEAPI_EXPORT +# ifdef _WIN32 +# if defined(ONEAPI_EXPORT) +# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllexport) +# else +# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllimport) +# endif +# else +# define CYCLES_KERNEL_ONEAPI_EXPORT +# endif +# endif + +class SyclQueue; + +typedef void (*OneAPIDeviceIteratorCallback)(const char *id, + const char *name, + int num, + void *user_ptr); + +typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr); + +struct KernelContext { + /* Queue, associated with selected device */ + SyclQueue *queue; + /* Pointer to USM device memory with all global/constant allocation on this device */ + void *kernel_globals; +}; + +/* Use extern C linking so that the symbols can be easily load from the dynamic library at runtime. + */ +# ifdef __cplusplus +extern "C" { +# endif + +# define DLL_INTERFACE_CALL(function, return_type, ...) 
\ + CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__); +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL + +# ifdef __cplusplus +} +# endif + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h new file mode 100644 index 00000000000..2dfc96292ed --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h @@ -0,0 +1,121 @@ +#pragma once + +/* Some macro magic to generate templates for kernel arguments. + The resulting oneapi_call() template allows to call a SYCL/C++ kernel + with typed arguments by only giving it a void **args as given by Cycles. + The template will automatically cast from void* to the expectd type. + */ + +/* When expanded by the preprocessor, the generated templates will look like this example: */ +#if 0 +template +void oneapi_call( + KernelGlobalsGPU *kg, + sycl::handler &cgh, + size_t global_size, + size_t local_size, + void **args, + void (*func)(const KernelGlobalsGPU *, size_t, size_t, sycl::handler &, T0, T1, T2)) +{ + func(kg, global_size, local_size, cgh, *(T0 *)(args[0]), *(T1 *)(args[1]), *(T2 *)(args[2])); +} +#endif + +/* clang-format off */ +#define ONEAPI_TYP(x) typename T##x +#define ONEAPI_CAST(x) *(T##x *)(args[x]) +#define ONEAPI_T(x) T##x + +#define ONEAPI_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, N, ...) N +#define ONEAPI_0(_call, ...) +#define ONEAPI_1(_call, x) _call(x) +#define ONEAPI_2(_call, x, ...) _call(x), ONEAPI_1(_call, __VA_ARGS__) +#define ONEAPI_3(_call, x, ...) _call(x), ONEAPI_2(_call, __VA_ARGS__) +#define ONEAPI_4(_call, x, ...) _call(x), ONEAPI_3(_call, __VA_ARGS__) +#define ONEAPI_5(_call, x, ...) _call(x), ONEAPI_4(_call, __VA_ARGS__) +#define ONEAPI_6(_call, x, ...) _call(x), ONEAPI_5(_call, __VA_ARGS__) +#define ONEAPI_7(_call, x, ...) 
_call(x), ONEAPI_6(_call, __VA_ARGS__) +#define ONEAPI_8(_call, x, ...) _call(x), ONEAPI_7(_call, __VA_ARGS__) +#define ONEAPI_9(_call, x, ...) _call(x), ONEAPI_8(_call, __VA_ARGS__) +#define ONEAPI_10(_call, x, ...) _call(x), ONEAPI_9(_call, __VA_ARGS__) +#define ONEAPI_11(_call, x, ...) _call(x), ONEAPI_10(_call, __VA_ARGS__) +#define ONEAPI_12(_call, x, ...) _call(x), ONEAPI_11(_call, __VA_ARGS__) +#define ONEAPI_13(_call, x, ...) _call(x), ONEAPI_12(_call, __VA_ARGS__) +#define ONEAPI_14(_call, x, ...) _call(x), ONEAPI_13(_call, __VA_ARGS__) +#define ONEAPI_15(_call, x, ...) _call(x), ONEAPI_14(_call, __VA_ARGS__) +#define ONEAPI_16(_call, x, ...) _call(x), ONEAPI_15(_call, __VA_ARGS__) +#define ONEAPI_17(_call, x, ...) _call(x), ONEAPI_16(_call, __VA_ARGS__) +#define ONEAPI_18(_call, x, ...) _call(x), ONEAPI_17(_call, __VA_ARGS__) +#define ONEAPI_19(_call, x, ...) _call(x), ONEAPI_18(_call, __VA_ARGS__) +#define ONEAPI_20(_call, x, ...) _call(x), ONEAPI_19(_call, __VA_ARGS__) +#define ONEAPI_21(_call, x, ...) _call(x), ONEAPI_20(_call, __VA_ARGS__) + +#define ONEAPI_CALL_FOR(x, ...) \ + ONEAPI_GET_NTH_ARG("ignored", \ + ##__VA_ARGS__, \ + ONEAPI_21, \ + ONEAPI_20, \ + ONEAPI_19, \ + ONEAPI_18, \ + ONEAPI_17, \ + ONEAPI_16, \ + ONEAPI_15, \ + ONEAPI_14, \ + ONEAPI_13, \ + ONEAPI_12, \ + ONEAPI_11, \ + ONEAPI_10, \ + ONEAPI_9, \ + ONEAPI_8, \ + ONEAPI_7, \ + ONEAPI_6, \ + ONEAPI_5, \ + ONEAPI_4, \ + ONEAPI_3, \ + ONEAPI_2, \ + ONEAPI_1, \ + ONEAPI_0) \ + (x, ##__VA_ARGS__) + +/* This template automatically casts entries in the void **args array to the types requested by the kernel func. + Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ +#define oneapi_template(...) 
\ + template \ + void oneapi_call( \ + KernelGlobalsGPU *kg, \ + sycl::handler &cgh, \ + size_t global_size, \ + size_t local_size, \ + void **args, \ + void (*func)(KernelGlobalsGPU*, size_t, size_t, sycl::handler &, ONEAPI_CALL_FOR(ONEAPI_T, __VA_ARGS__))) \ + { \ + func(kg, \ + global_size, \ + local_size, \ + cgh, \ + ONEAPI_CALL_FOR(ONEAPI_CAST, __VA_ARGS__)); \ + } + +oneapi_template(0) +oneapi_template(0, 1) +oneapi_template(0, 1, 2) +oneapi_template(0, 1, 2, 3) +oneapi_template(0, 1, 2, 3, 4) +oneapi_template(0, 1, 2, 3, 4, 5) +oneapi_template(0, 1, 2, 3, 4, 5, 6) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20) + + /* clang-format on */ diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index ad022716207..f2e61d25002 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1571,7 +1571,7 @@ static_assert_align(KernelShaderEvalInput, 16); * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified. * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. 
*/ -typedef enum DeviceKernel { +typedef enum DeviceKernel : int { DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0, DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, -- cgit v1.2.3