From 305b92e05f748a0fd9cb62b9829791d717ba2d57 Mon Sep 17 00:00:00 2001
From: Xavier Hallade
Date: Fri, 21 Oct 2022 14:10:25 +0200
Subject: Cycles: oneAPI: remove use of SYCL host device

Host device is deprecated in SYCL 2020 spec, cpu device or standard C++
should be used instead.
---
 CMakeLists.txt                                |  2 -
 intern/cycles/device/oneapi/device.cpp        |  4 +-
 intern/cycles/device/oneapi/device_impl.cpp   | 30 +++------------
 intern/cycles/kernel/CMakeLists.txt           |  4 --
 .../kernel/device/gpu/parallel_active_index.h | 33 +++-------------
 intern/cycles/kernel/device/oneapi/compat.h   | 45 +++++-----------------
 intern/cycles/kernel/device/oneapi/globals.h  |  9 -----
 intern/cycles/kernel/device/oneapi/kernel.cpp |  7 ----
 8 files changed, 21 insertions(+), 113 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9134c7c1ed6..a3ea162d040 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -489,14 +489,12 @@ endif()
 if(NOT APPLE)
   option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
   option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
-  option(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED "Enable use of SYCL host (CPU) device execution by oneAPI implementation. This option is for debugging purposes and impacts GPU execution." OFF)
 
   # https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
   # acm-g10 is the architecture for the first Arc Alchemist GPUs but we'll keep using dg2 until IGC dependency is updated to support acm-g10.
   set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "dg2" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
   set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
 
-  mark_as_advanced(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
   mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
   mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
 endif()
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index f303ab41627..66d6f749e30 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -39,7 +39,7 @@ bool device_oneapi_init()
     _putenv_s("SYCL_CACHE_THRESHOLD", "0");
   }
   if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-    _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+    _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
   }
   if (getenv("SYCL_ENABLE_PCI") == nullptr) {
     _putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +50,7 @@ bool device_oneapi_init()
 #  elif __linux__
   setenv("SYCL_CACHE_PERSISTENT", "1", false);
   setenv("SYCL_CACHE_THRESHOLD", "0", false);
-  setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+  setenv("SYCL_DEVICE_FILTER", "level_zero", false);
   setenv("SYCL_ENABLE_PCI", "1", false);
   setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
 #  endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index f14eada071d..4e7849e6b9a 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -430,8 +430,7 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
   sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
   (void)usm_type;
   assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::cpu || allow_host) &&
+         ((device_type == sycl::info::device_type::cpu || allow_host) &&
           usm_type == sycl::usm::alloc::host ||
           usm_type == sycl::usm::alloc::unknown));
 #  else
@@ -672,14 +671,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
   if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
     allow_all_devices = true;
 
-  /* Host device is useful only for debugging at the moment
-   * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
-
   const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
 
   std::vector<sycl::device> available_devices;
@@ -691,17 +682,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
     }
 
     const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
+        (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
+                              platform.get_devices(sycl::info::device_type::gpu);
 
     for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
+      if (!allow_all_devices) {
         bool filter_out = false;
 
         /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
@@ -733,9 +718,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
           }
         }
       }
-      else if (!allow_host && device.is_host()) {
-        filter_out = true;
-      }
       else if (!allow_all_devices) {
         filter_out = true;
       }
@@ -798,9 +780,7 @@ char *OneapiDevice::device_capabilities()
     GET_NUM_ATTR(native_vector_width_double)
     GET_NUM_ATTR(native_vector_width_half)
 
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
     WRITE_ATTR("max_clock_frequency", max_clock_frequency)
 
     GET_NUM_ATTR(address_bits)
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 1e69d14b1b7..b6a53117a3b 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -752,10 +752,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
     ${SYCL_CPP_FLAGS}
   )
 
-  if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
-    list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
-  endif()
-
   # Set defaults for spir64 and spir64_gen options
   if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
     set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index c1df49c4f49..38cdcb572eb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
  * and keep device specific code in compat.h */
 
 #ifdef __KERNEL_ONEAPI__
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-template<typename IsActiveOp>
-void cpu_serial_active_index_array_impl(const uint num_states,
-                                        ccl_global int *ccl_restrict indices,
-                                        ccl_global int *ccl_restrict num_indices,
-                                        IsActiveOp is_active_op)
-{
-  int write_index = 0;
-  for (int state_index = 0; state_index < num_states; state_index++) {
-    if (is_active_op(state_index))
-      indices[write_index++] = state_index;
-  }
-  *num_indices = write_index;
-  return;
-}
-#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
 
 template<typename IsActiveOp>
 void gpu_parallel_active_index_array_impl(const uint num_states,
@@ -182,18 +166,11 @@ __device__
                                          num_simd_groups, \
                                          simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      if (ccl_gpu_global_size_x() == 1) \
-        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
-      else \
-        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
-#  else
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
-#  endif
+
+#  define gpu_parallel_active_index_array( \
+      blocksize, num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+
 #else
 
 #  define gpu_parallel_active_index_array( \
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 8ae40b0612e..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -55,18 +55,6 @@
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers)
 #define ccl_gpu_kernel_threads(block_num_threads)
 
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define KG_ND_ITEMS \
-    kg->nd_item_local_id_0 = item.get_local_id(0); \
-    kg->nd_item_local_range_0 = item.get_local_range(0); \
-    kg->nd_item_group_0 = item.get_group(0); \
-    kg->nd_item_group_range_0 = item.get_group_range(0); \
-    kg->nd_item_global_id_0 = item.get_global_id(0); \
-    kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-#  define KG_ND_ITEMS
-#endif
-
 #define ccl_gpu_kernel_signature(name, ...) \
   void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                             size_t kernel_global_size, \
@@ -76,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
     (kg); \
     cgh.parallel_for( \
         sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
-        [=](sycl::nd_item<1> item) { \
-          KG_ND_ITEMS
+        [=](sycl::nd_item<1> item) {
 
 #define ccl_gpu_kernel_postfix \
   }); \
@@ -95,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
   } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
 
 /* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
 
 /* GPU warp synchronization */
-
 #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
 #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
 
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
 #undef KERNEL_DATA_ARRAY
   IntegratorStateGPU *integrator_state;
   const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  size_t nd_item_local_id_0;
-  size_t nd_item_local_range_0;
-  size_t nd_item_group_0;
-  size_t nd_item_group_range_0;
-
-  size_t nd_item_global_id_0;
-  size_t nd_item_global_range_0;
-#endif
 } KernelGlobalsGPU;
 
 typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 1f32d3406ea..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -230,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
     /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
      * we extend work size to fit uniformity requirements. */
     global_size = groups_count * local_size;
-
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-    if (queue->get_device().is_host()) {
-      global_size = 1;
-      local_size = 1;
-    }
-#  endif
   }
 
   /* Let the compiler throw an error if there are any kernels missing in this implementation. */
-- 
cgit v1.2.3
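
Note (not part of the patch): the commit message points at the SYCL 2020 replacement for the removed host device, namely selecting a cpu device (or using plain C++ on the host). The snippet below is a minimal, illustrative sketch of that pattern, assuming a SYCL 2020 compiler such as oneAPI DPC++ and an available CPU backend; it is not taken from the Cycles sources.

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main()
    {
      /* SYCL 2020 removed the host device; for host-side debugging, select a
       * CPU device explicitly. This throws if no CPU backend is installed, in
       * which case sycl::default_selector_v can be used instead. */
      sycl::queue q{sycl::cpu_selector_v};
      std::cout << "Running on: "
                << q.get_device().get_info<sycl::info::device::name>() << "\n";

      /* Trivial USM workload, standing in for the debugging role the host
       * device used to serve. */
      int *data = sycl::malloc_shared<int>(16, q);
      q.parallel_for(sycl::range<1>(16), [=](sycl::id<1> i) { data[i] = int(i[0]); }).wait();
      sycl::free(data, q);
      return 0;
    }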