diff options
author | Xavier Hallade <xavier.hallade@intel.com> | 2022-10-06 19:35:51 +0300 |
---|---|---|
committer | Xavier Hallade <xavier.hallade@intel.com> | 2022-10-07 10:50:05 +0300 |
commit | 7eeeaec6da33971ab7805c9a4bfd5f4e186273d1 (patch) | |
tree | f3090686dab61d3a25d77fdbece49f1bc4c04ca4 /intern/cycles | |
parent | fc0b1627ebb821b1897cbca7f6ba9be29e52359a (diff) |
Cycles: use direct linking for oneAPI backend
This is a minimal set of changes, allowing a lot of cleanup that can
happen afterward as it allows sycl method and objects to be used outside
of kernel.cpp.
Reviewed By: brecht, sergey
Differential Revision: https://developer.blender.org/D15397
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/device/CMakeLists.txt | 16 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device.cpp | 65 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device_impl.cpp | 525 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/device_impl.h | 48 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/dll_interface.h | 17 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/queue.cpp | 11 | ||||
-rw-r--r-- | intern/cycles/device/oneapi/queue.h | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 38 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/dll_interface_template.h | 54 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.cpp | 467 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.h | 19 |
11 files changed, 561 insertions, 703 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 24855d795d1..e5467121497 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -142,7 +142,6 @@ set(SRC ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} - ${SRC_ONEAPI} ${SRC_HEADERS} ) @@ -188,7 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL) ) endif() if (WITH_CYCLES_DEVICE_ONEAPI) + if(WIN32) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib) + else() + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so) + endif() + list(APPEND LIB + ${SYCL_LIBRARY} + ${cycles_kernel_oneapi_lib} + ) add_definitions(-DWITH_ONEAPI) + list(APPEND SRC + ${SRC_ONEAPI} + ) + list(APPEND INC_SYS + ${SYCL_INCLUDE_DIR} + ) endif() if(WITH_OPENIMAGEDENOISE) diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp index 4aa307e9300..f303ab41627 100644 --- a/intern/cycles/device/oneapi/device.cpp +++ b/intern/cycles/device/oneapi/device.cpp @@ -19,62 +19,12 @@ CCL_NAMESPACE_BEGIN -#ifdef WITH_ONEAPI -static OneAPIDLLInterface oneapi_dll; -#endif - -#ifdef _WIN32 -# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path)) -# define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() GetLastError() -# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle) -# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name) -#elif __linux__ -# define LOAD_ONEAPI_SHARED_LIBRARY(path) dlopen(path, RTLD_NOW) -# define LOAD_ONEAPI_SHARED_LIBRARY_ERROR() dlerror() -# define FREE_SHARED_LIBRARY(handle) dlclose(handle) -# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name) -#endif - bool device_oneapi_init() { #if !defined(WITH_ONEAPI) return false; #else - string lib_path = path_get("lib"); -# ifdef _WIN32 - lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll"); -# else - lib_path = path_join(lib_path, "cycles_kernel_oneapi.so"); -# endif - void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str()); - - /* This shouldn't happen, but it still makes sense to have a branch for this. */ - if (lib_handle == NULL) { - LOG(ERROR) << "oneAPI kernel shared library cannot be loaded: " - << LOAD_ONEAPI_SHARED_LIBRARY_ERROR(); - return false; - } - -# define DLL_INTERFACE_CALL(function, return_type, ...) \ - (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \ - GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \ - if (oneapi_dll.function == NULL) { \ - LOG(ERROR) << "oneAPI shared library function \"" << #function \ - << "\" has not been loaded from kernel shared - disable oneAPI " \ - "library disable oneAPI implementation due to this"; \ - FREE_SHARED_LIBRARY(lib_handle); \ - return false; \ - } -# include "kernel/device/oneapi/dll_interface_template.h" -# undef DLL_INTERFACE_CALL - - VLOG_INFO << "oneAPI kernel shared library has been loaded successfully"; - - /* We need to have this oneapi kernel shared library during all life-span of the Blender. - * So it is not unloaded because of this. - * FREE_SHARED_LIBRARY(lib_handle); */ - /* NOTE(@nsirgien): we need to enable JIT cache from here and * right now this cache policy is controlled by env. variables. */ /* NOTE(hallade) we also disable use of copy engine as it @@ -109,17 +59,10 @@ bool device_oneapi_init() #endif } -#if defined(_WIN32) || defined(__linux__) -# undef LOAD_SYCL_SHARED_LIBRARY -# undef LOAD_ONEAPI_SHARED_LIBRARY -# undef FREE_SHARED_LIBRARY -# undef GET_SHARED_LIBRARY_SYMBOL -#endif - Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { #ifdef WITH_ONEAPI - return new OneapiDevice(info, oneapi_dll, stats, profiler); + return new OneapiDevice(info, stats, profiler); #else (void)info; (void)stats; @@ -165,7 +108,7 @@ static void device_iterator_cb(const char *id, const char *name, int num, void * void device_oneapi_info(vector<DeviceInfo> &devices) { #ifdef WITH_ONEAPI - (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices); + OneapiDevice::iterate_devices(device_iterator_cb, &devices); #else /* WITH_ONEAPI */ (void)devices; #endif /* WITH_ONEAPI */ @@ -175,10 +118,10 @@ string device_oneapi_capabilities() { string capabilities; #ifdef WITH_ONEAPI - char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)(); + char *c_capabilities = OneapiDevice::device_capabilities(); if (c_capabilities) { capabilities = c_capabilities; - (oneapi_dll.oneapi_free)(c_capabilities); + free(c_capabilities); } #endif return capabilities; diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp index dd0622a5bd5..2df605fa047 100644 --- a/intern/cycles/device/oneapi/device_impl.cpp +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -8,7 +8,7 @@ # include "util/debug.h" # include "util/log.h" -# include "kernel/device/oneapi/kernel.h" +# include "kernel/device/oneapi/globals.h" CCL_NAMESPACE_BEGIN @@ -19,26 +19,19 @@ static void queue_error_cb(const char *message, void *user_ptr) } } -OneapiDevice::OneapiDevice(const DeviceInfo &info, - OneAPIDLLInterface &oneapi_dll_object, - Stats &stats, - Profiler &profiler) +OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) : Device(info, stats, profiler), device_queue_(nullptr), texture_info_(this, "texture_info", MEM_GLOBAL), kg_memory_(nullptr), kg_memory_device_(nullptr), - kg_memory_size_(0), - oneapi_dll_(oneapi_dll_object) + kg_memory_size_(0) { need_texture_info_ = false; - oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); + oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); - /* OneAPI calls should be initialized on this moment. */ - assert(oneapi_dll_.oneapi_create_queue != nullptr); - - bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num); + bool is_finished_ok = create_queue(device_queue_, info.num); if (is_finished_ok == false) { set_error("oneAPI queue initialization error: got runtime exception \"" + oneapi_error_string_ + "\""); @@ -50,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, } size_t globals_segment_size; - is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size); + is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size); if (is_finished_ok == false) { set_error("oneAPI constant memory initialization got runtime exception \"" + oneapi_error_string_ + "\""); @@ -59,27 +52,27 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)"; } - kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16); - oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size); + kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16); + usm_memset(device_queue_, kg_memory_, 0, globals_segment_size); - kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size); + kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size); kg_memory_size_ = globals_segment_size; - max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_); + max_memory_on_device_ = get_memcapacity(); } OneapiDevice::~OneapiDevice() { texture_info_.free(); - oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_); - oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_); + usm_free(device_queue_, kg_memory_); + usm_free(device_queue_, kg_memory_device_); for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++) delete mt->second; if (device_queue_) - oneapi_dll_.oneapi_free_queue(device_queue_); + free_queue(device_queue_); } bool OneapiDevice::check_peer_access(Device * /*peer_device*/) @@ -99,7 +92,7 @@ bool OneapiDevice::load_kernels(const uint requested_features) * with specialization constants, but it hasn't been implemented yet. */ (void)requested_features; - bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_); + bool is_finished_ok = oneapi_run_test_kernel(device_queue_); if (is_finished_ok == false) { set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\""); } @@ -138,7 +131,7 @@ void OneapiDevice::generic_alloc(device_memory &mem) * type has been used for oneAPI device in order to better fit in Cycles architecture. */ void *device_pointer = nullptr; if (mem.memory_size() + stats.mem_used < max_memory_on_device_) - device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size); + device_pointer = usm_alloc_device(device_queue_, memory_size); if (device_pointer == nullptr) { set_error("oneAPI kernel - device memory allocation error for " + string_human_readable_size(mem.memory_size()) + @@ -163,8 +156,7 @@ void OneapiDevice::generic_copy_to(device_memory &mem) /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */ assert(mem.host_pointer); assert(device_queue_); - oneapi_dll_.oneapi_usm_memcpy( - device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size); + usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size); } /* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */ @@ -178,11 +170,6 @@ string OneapiDevice::oneapi_error_message() return string(oneapi_error_string_); } -OneAPIDLLInterface OneapiDevice::oneapi_dll_object() -{ - return oneapi_dll_; -} - void *OneapiDevice::kernel_globals_device_pointer() { return kg_memory_device_; @@ -198,7 +185,7 @@ void OneapiDevice::generic_free(device_memory &mem) mem.device_size = 0; assert(device_queue_); - oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer); + usm_free(device_queue_, (void *)mem.device_pointer); mem.device_pointer = 0; } @@ -266,8 +253,7 @@ void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t if (mem.device_pointer) { char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset; char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset; - bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy( - device_queue_, shifted_host, shifted_device, size); + bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size); if (is_finished_ok == false) { set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + "\""); @@ -292,7 +278,7 @@ void OneapiDevice::mem_zero(device_memory &mem) } assert(device_queue_); - bool is_finished_ok = oneapi_dll_.oneapi_usm_memset( + bool is_finished_ok = usm_memset( device_queue_, (void *)mem.device_pointer, 0, mem.memory_size()); if (is_finished_ok == false) { set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + @@ -349,10 +335,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size) memcpy(data->data(), host, size); data->copy_to_device(); - oneapi_dll_.oneapi_set_global_memory( - device_queue_, kg_memory_, name, (void *)data->device_pointer); + set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer); - oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); + usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); } void OneapiDevice::global_alloc(device_memory &mem) @@ -367,10 +352,9 @@ void OneapiDevice::global_alloc(device_memory &mem) generic_alloc(mem); generic_copy_to(mem); - oneapi_dll_.oneapi_set_global_memory( - device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer); + set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer); - oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); + usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); } void OneapiDevice::global_free(device_memory &mem) @@ -410,18 +394,6 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create() return make_unique<OneapiDeviceQueue>(this); } -int OneapiDevice::get_num_multiprocessors() -{ - assert(device_queue_); - return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_); -} - -int OneapiDevice::get_max_num_threads_per_multiprocessor() -{ - assert(device_queue_); - return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_); -} - bool OneapiDevice::should_use_graphics_interop() { /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so @@ -432,13 +404,460 @@ bool OneapiDevice::should_use_graphics_interop() void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment) { assert(device_queue_); - return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment); + return usm_aligned_alloc_host(device_queue_, memory_size, alignment); } void OneapiDevice::usm_free(void *usm_ptr) { assert(device_queue_); - return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr); + return usm_free(device_queue_, usm_ptr); +} + +void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) +{ +# ifdef _DEBUG + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + sycl::info::device_type device_type = + queue->get_device().get_info<sycl::info::device::device_type>(); + sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); + (void)usm_type; + assert(usm_type == sycl::usm::alloc::device || + ((device_type == sycl::info::device_type::host || + device_type == sycl::info::device_type::cpu || allow_host) && + usm_type == sycl::usm::alloc::host)); +# endif +} + +bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index) +{ + bool finished_correct = true; + try { + std::vector<sycl::device> devices = OneapiDevice::available_devices(); + if (device_index < 0 || device_index >= devices.size()) { + return false; + } + sycl::queue *created_queue = new sycl::queue(devices[device_index], + sycl::property::queue::in_order()); + external_queue = reinterpret_cast<SyclQueue *>(created_queue); + } + catch (sycl::exception const &e) { + finished_correct = false; + oneapi_error_string_ = e.what(); + } + return finished_correct; +} + +void OneapiDevice::free_queue(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + delete queue; +} + +void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + return sycl::aligned_alloc_host(alignment, memory_size, *queue); +} + +void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + return sycl::malloc_device(memory_size, *queue); +} + +void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + OneapiDevice::check_usm(queue_, usm_ptr, true); + sycl::free(usm_ptr, *queue); +} + +bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + OneapiDevice::check_usm(queue_, dest, true); + OneapiDevice::check_usm(queue_, src, true); + sycl::event mem_event = queue->memcpy(dest, src, num_bytes); +# ifdef WITH_CYCLES_DEBUG + try { + /* NOTE(@nsirgien) Waiting on memory operation may give more precise error + * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. + */ + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + oneapi_error_string_ = e.what(); + return false; + } +# else + sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context()); + sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context()); + bool from_device_to_host = dest_type == sycl::usm::alloc::host && + src_type == sycl::usm::alloc::device; + bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown || + src_type == sycl::usm::alloc::unknown; + /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host + * may not wait until the end of the transfer before using the memory. + */ + if (from_device_to_host || host_or_device_memop_with_offset) + mem_event.wait(); + return true; +# endif +} + +bool OneapiDevice::usm_memset(SyclQueue *queue_, + void *usm_ptr, + unsigned char value, + size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + OneapiDevice::check_usm(queue_, usm_ptr, true); + sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); +# ifdef WITH_CYCLES_DEBUG + try { + /* NOTE(@nsirgien) Waiting on memory operation may give more precise error + * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. + */ + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + oneapi_error_string_ = e.what(); + return false; + } +# else + (void)mem_event; + return true; +# endif +} + +bool OneapiDevice::queue_synchronize(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + try { + queue->wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + oneapi_error_string_ = e.what(); + return false; + } +} + +bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) +{ + kernel_global_size = sizeof(KernelGlobalsGPU); + + return true; +} + +void OneapiDevice::set_global_memory(SyclQueue *queue_, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) +{ + assert(queue_); + assert(kernel_globals); + assert(memory_name); + assert(memory_device_pointer); + KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; + OneapiDevice::check_usm(queue_, memory_device_pointer); + OneapiDevice::check_usm(queue_, kernel_globals, true); + + std::string matched_name(memory_name); + +/* This macro will change global ptr of KernelGlobals via name matching. */ +# define KERNEL_DATA_ARRAY(type, name) \ + else if (#name == matched_name) \ + { \ + globals->__##name = (type *)memory_device_pointer; \ + return; \ + } + if (false) { + } + else if ("integrator_state" == matched_name) { + globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; + return; + } + KERNEL_DATA_ARRAY(KernelData, data) +# include "kernel/data_arrays.h" + else + { + std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" + << std::endl; + assert(false); + } +# undef KERNEL_DATA_ARRAY +} + +bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context, + int kernel, + size_t global_size, + void **args) +{ + return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args); +} + +/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows + * since Windows driver 101.3268. */ +/* The same min compute-runtime version is currently required across Windows and Linux. + * For Windows driver 101.3430, compute-runtime version is 23904. */ +static const int lowest_supported_driver_version_win = 1013430; +static const int lowest_supported_driver_version_neo = 23904; + +int OneapiDevice::parse_driver_build_version(const sycl::device &device) +{ + const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); + int driver_build_version = 0; + + size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); + if (second_dot_position == std::string::npos) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info<sycl::info::device::name>() << "\"." << std::endl; + } + else { + try { + size_t third_dot_position = driver_version.find('.', second_dot_position + 1); + if (third_dot_position != std::string::npos) { + const std::string &third_number_substr = driver_version.substr( + second_dot_position + 1, third_dot_position - second_dot_position - 1); + const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); + if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) + driver_build_version = std::stoi(third_number_substr) * 10000 + + std::stoi(forth_number_substr); + } + else { + const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); + driver_build_version = std::stoi(third_number_substr); + } + } + catch (std::invalid_argument &) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info<sycl::info::device::name>() << "\"." << std::endl; + } + } + + return driver_build_version; +} + +std::vector<sycl::device> OneapiDevice::available_devices() +{ + bool allow_all_devices = false; + if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) + allow_all_devices = true; + + /* Host device is useful only for debugging at the moment + * so we hide this device with default build settings. */ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + bool allow_host = true; +# else + bool allow_host = false; +# endif + + const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); + + std::vector<sycl::device> available_devices; + for (const sycl::platform &platform : oneapi_platforms) { + /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. + */ + if (platform.get_backend() == sycl::backend::opencl) { + continue; + } + + const std::vector<sycl::device> &oneapi_devices = + (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : + platform.get_devices(sycl::info::device_type::gpu); + + for (const sycl::device &device : oneapi_devices) { + if (allow_all_devices) { + /* still filter out host device if build doesn't support it. */ + if (allow_host || !device.is_host()) { + available_devices.push_back(device); + } + } + else { + bool filter_out = false; + + /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, + * assuming they have either more than 96 Execution Units or not 7 threads per EU. + * Official support can be broaden to older and smaller GPUs once ready. */ + if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { + /* Filtered-out defaults in-case these values aren't available through too old L0 + * runtime. */ + int number_of_eus = 96; + int threads_per_eu = 7; + if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { + number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + } + if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { + threads_per_eu = + device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + } + /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ + if (number_of_eus <= 96 && threads_per_eu == 7) { + filter_out = true; + } + /* if not already filtered out, check driver version. */ + if (!filter_out) { + int driver_build_version = parse_driver_build_version(device); + if ((driver_build_version > 100000 && + driver_build_version < lowest_supported_driver_version_win) || + driver_build_version < lowest_supported_driver_version_neo) { + filter_out = true; + } + } + } + else if (!allow_host && device.is_host()) { + filter_out = true; + } + else if (!allow_all_devices) { + filter_out = true; + } + + if (!filter_out) { + available_devices.push_back(device); + } + } + } + } + + return available_devices; +} + +char *OneapiDevice::device_capabilities() +{ + std::stringstream capabilities; + + const std::vector<sycl::device> &oneapi_devices = available_devices(); + for (const sycl::device &device : oneapi_devices) { + const std::string &name = device.get_info<sycl::info::device::name>(); + + capabilities << std::string("\t") << name << "\n"; +# define WRITE_ATTR(attribute_name, attribute_variable) \ + capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ + << "\n"; +# define GET_NUM_ATTR(attribute) \ + { \ + size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ + capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ + } + + GET_NUM_ATTR(vendor_id) + GET_NUM_ATTR(max_compute_units) + GET_NUM_ATTR(max_work_item_dimensions) + + sycl::id<3> max_work_item_sizes = + device.get_info<sycl::info::device::max_work_item_sizes<3>>(); + WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) + WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) + WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) + + GET_NUM_ATTR(max_work_group_size) + GET_NUM_ATTR(max_num_sub_groups) + GET_NUM_ATTR(sub_group_independent_forward_progress) + + GET_NUM_ATTR(preferred_vector_width_char) + GET_NUM_ATTR(preferred_vector_width_short) + GET_NUM_ATTR(preferred_vector_width_int) + GET_NUM_ATTR(preferred_vector_width_long) + GET_NUM_ATTR(preferred_vector_width_float) + GET_NUM_ATTR(preferred_vector_width_double) + GET_NUM_ATTR(preferred_vector_width_half) + + GET_NUM_ATTR(native_vector_width_char) + GET_NUM_ATTR(native_vector_width_short) + GET_NUM_ATTR(native_vector_width_int) + GET_NUM_ATTR(native_vector_width_long) + GET_NUM_ATTR(native_vector_width_float) + GET_NUM_ATTR(native_vector_width_double) + GET_NUM_ATTR(native_vector_width_half) + + size_t max_clock_frequency = + (size_t)(device.is_host() ? (size_t)0 : + device.get_info<sycl::info::device::max_clock_frequency>()); + WRITE_ATTR("max_clock_frequency", max_clock_frequency) + + GET_NUM_ATTR(address_bits) + GET_NUM_ATTR(max_mem_alloc_size) + + /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't + * supported so we always return false, even if device supports HW texture usage acceleration. + */ + bool image_support = false; + WRITE_ATTR("image_support", (size_t)image_support) + + GET_NUM_ATTR(max_parameter_size) + GET_NUM_ATTR(mem_base_addr_align) + GET_NUM_ATTR(global_mem_size) + GET_NUM_ATTR(local_mem_size) + GET_NUM_ATTR(error_correction_support) + GET_NUM_ATTR(profiling_timer_resolution) + GET_NUM_ATTR(is_available) + +# undef GET_NUM_ATTR +# undef WRITE_ATTR + capabilities << "\n"; + } + + return ::strdup(capabilities.str().c_str()); +} + +void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) +{ + int num = 0; + std::vector<sycl::device> devices = OneapiDevice::available_devices(); + for (sycl::device &device : devices) { + const std::string &platform_name = + device.get_platform().get_info<sycl::info::platform::name>(); + std::string name = device.get_info<sycl::info::device::name>(); + std::string id = "ONEAPI_" + platform_name + "_" + name; + if (device.has(sycl::aspect::ext_intel_pci_address)) { + id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); + } + (cb)(id.c_str(), name.c_str(), num, user_ptr); + num++; + } +} + +size_t OneapiDevice::get_memcapacity() +{ + return reinterpret_cast<sycl::queue *>(device_queue_) + ->get_device() + .get_info<sycl::info::device::global_mem_size>(); +} + +int OneapiDevice::get_num_multiprocessors() +{ + const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device(); + if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { + return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); + } + else + return 0; +} + +int OneapiDevice::get_max_num_threads_per_multiprocessor() +{ + const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device(); + if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && + device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { + return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() * + device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + } + else + return 0; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h index 6abebf98684..3589e881a6e 100644 --- a/intern/cycles/device/oneapi/device_impl.h +++ b/intern/cycles/device/oneapi/device_impl.h @@ -3,9 +3,12 @@ #ifdef WITH_ONEAPI +# include <CL/sycl.hpp> + # include "device/device.h" # include "device/oneapi/device.h" # include "device/oneapi/queue.h" +# include "kernel/device/oneapi/kernel.h" # include "util/map.h" @@ -13,6 +16,11 @@ CCL_NAMESPACE_BEGIN class DeviceQueue; +typedef void (*OneAPIDeviceIteratorCallback)(const char *id, + const char *name, + int num, + void *user_ptr); + class OneapiDevice : public Device { private: SyclQueue *device_queue_; @@ -25,16 +33,12 @@ class OneapiDevice : public Device { void *kg_memory_device_; size_t kg_memory_size_ = (size_t)0; size_t max_memory_on_device_ = (size_t)0; - OneAPIDLLInterface oneapi_dll_; std::string oneapi_error_string_; public: virtual BVHLayoutMask get_bvh_layout_mask() const override; - OneapiDevice(const DeviceInfo &info, - OneAPIDLLInterface &oneapi_dll_object, - Stats &stats, - Profiler &profiler); + OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); virtual ~OneapiDevice(); @@ -50,12 +54,8 @@ class OneapiDevice : public Device { void generic_free(device_memory &mem); - SyclQueue *sycl_queue(); - string oneapi_error_message(); - OneAPIDLLInterface oneapi_dll_object(); - void *kernel_globals_device_pointer(); void mem_alloc(device_memory &mem) override; @@ -90,13 +90,37 @@ class OneapiDevice : public Device { virtual unique_ptr<DeviceQueue> gpu_queue_create() override; - int get_num_multiprocessors(); - int get_max_num_threads_per_multiprocessor(); - /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host * side compilation (MSVC). */ void *usm_aligned_alloc_host(size_t memory_size, size_t alignment); void usm_free(void *usm_ptr); + + static std::vector<sycl::device> available_devices(); + static char *device_capabilities(); + static int parse_driver_build_version(const sycl::device &device); + static void iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr); + + size_t get_memcapacity(); + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + bool queue_synchronize(SyclQueue *queue); + bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size); + void set_global_memory(SyclQueue *queue, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer); + bool enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, void **args); + SyclQueue *sycl_queue(); + + protected: + void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host); + bool create_queue(SyclQueue *&external_queue, int device_index); + void free_queue(SyclQueue *queue); + void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment); + void *usm_alloc_device(SyclQueue *queue, size_t memory_size); + void usm_free(SyclQueue *queue, void *usm_ptr); + bool usm_memcpy(SyclQueue *queue, void *dest, void *src, size_t num_bytes); + bool usm_memset(SyclQueue *queue, void *usm_ptr, unsigned char value, size_t num_bytes); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h deleted file mode 100644 index 0a888194e98..00000000000 --- a/intern/cycles/device/oneapi/dll_interface.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2011-2022 Blender Foundation */ - -#pragma once - -/* Include kernel header to get access to SYCL-specific types, like SyclQueue and - * OneAPIDeviceIteratorCallback. */ -#include "kernel/device/oneapi/kernel.h" - -#ifdef WITH_ONEAPI -struct OneAPIDLLInterface { -# define DLL_INTERFACE_CALL(function, return_type, ...) \ - return_type (*function)(__VA_ARGS__) = nullptr; -# include "kernel/device/oneapi/dll_interface_template.h" -# undef DLL_INTERFACE_CALL -}; -#endif diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp index f865b27f976..9632b14d485 100644 --- a/intern/cycles/device/oneapi/queue.cpp +++ b/intern/cycles/device/oneapi/queue.cpp @@ -22,10 +22,7 @@ struct KernelExecutionInfo { /* OneapiDeviceQueue */ OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device) - : DeviceQueue(device), - oneapi_device_(device), - oneapi_dll_(device->oneapi_dll_object()), - kernel_context_(nullptr) + : DeviceQueue(device), oneapi_device_(device), kernel_context_(nullptr) { } @@ -81,14 +78,14 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel, assert(signed_kernel_work_size >= 0); size_t kernel_work_size = (size_t)signed_kernel_work_size; - size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size( + size_t kernel_local_size = oneapi_kernel_preferred_local_size( kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size); size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size); assert(kernel_context_); /* Call the oneAPI kernel DLL to launch the requested kernel. */ - bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel( + bool is_finished_ok = oneapi_device_->enqueue_kernel( kernel_context_, kernel, uniformed_kernel_work_size, args); if (is_finished_ok == false) { @@ -108,7 +105,7 @@ bool OneapiDeviceQueue::synchronize() return false; } - bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue()); + bool is_finished_ok = oneapi_device_->queue_synchronize(oneapi_device_->sycl_queue()); if (is_finished_ok == false) oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" + oneapi_device_->oneapi_error_message() + "\""); diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h index 716cbfdc88c..32363bf2a6e 100644 --- a/intern/cycles/device/oneapi/queue.h +++ b/intern/cycles/device/oneapi/queue.h @@ -10,7 +10,7 @@ # include "device/queue.h" # include "device/oneapi/device.h" -# include "device/oneapi/dll_interface.h" +# include "kernel/device/oneapi/kernel.h" CCL_NAMESPACE_BEGIN @@ -41,9 +41,7 @@ class OneapiDeviceQueue : public DeviceQueue { protected: OneapiDevice *oneapi_device_; - OneAPIDLLInterface oneapi_dll_; KernelContext *kernel_context_; - bool with_kernel_statistics_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 9cf9b761651..bbf8fb8682b 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -716,7 +716,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI) if(WIN32) set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll) else() - set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so) endif() set(cycles_oneapi_kernel_sources @@ -815,6 +815,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI) if(WIN32) list(APPEND sycl_compiler_flags + -fuse-ld=link -fms-extensions -fms-compatibility -D_WINDLL @@ -888,33 +889,24 @@ if(WITH_CYCLES_DEVICE_ONEAPI) endif() endif() + if(NOT WITH_BLENDER) + # For the Cycles standalone put libraries next to the Cycles application. + set(cycles_oneapi_target_path ${CYCLES_INSTALL_PATH}) + else() + # For Blender put the libraries next to the Blender executable. + # + # Note that the installation path in the delayed_install is relative to the versioned folder, + # which means we need to go one level up. + set(cycles_oneapi_target_path "../") + endif() + # install dynamic libraries required at runtime if(WIN32) - set(SYCL_RUNTIME_DEPENDENCIES - sycl.dll - pi_level_zero.dll - ) - if(NOT WITH_BLENDER) - # For the Cycles standalone put libraries next to the Cycles application. - delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}) - else() - # For Blender put the libraries next to the Blender executable. - # - # Note that the installation path in the delayed_install is relative to the versioned folder, - # which means we need to go one level up. - delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../") - endif() + delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path}) elseif(UNIX AND NOT APPLE) - file(GLOB SYCL_RUNTIME_DEPENDENCIES - ${sycl_compiler_root}/../lib/libsycl.so - ${sycl_compiler_root}/../lib/libsycl.so.[0-9] - ${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9] - ) - list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so) - delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib) + delayed_install("" "${cycles_kernel_oneapi_lib}" ${cycles_oneapi_target_path}/lib) endif() - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib) add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib}) endif() diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h deleted file mode 100644 index 5dd0d4203a4..00000000000 --- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2022 Intel Corporation */ - -/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */ -DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) -DLL_INTERFACE_CALL(oneapi_free, void, void *) -DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) - -DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) -DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) - -DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index) -DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue) -DLL_INTERFACE_CALL( - oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment) -DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size) -DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr) - -DLL_INTERFACE_CALL( - oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes) -DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_usm_memset, - bool, - SyclQueue *queue, - void *usm_ptr, - unsigned char value, - size_t num_bytes) - -DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue) - -/* Operation with Kernel globals structure - map of global/constant allocation - filled before - * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as - * pointer. */ -DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size) -DLL_INTERFACE_CALL(oneapi_set_global_memory, - void, - SyclQueue *queue, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) - -DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size, - size_t, - SyclQueue *queue, - const DeviceKernel kernel, - const size_t kernel_global_size) -DLL_INTERFACE_CALL(oneapi_enqueue_kernel, - bool, - KernelContext *context, - int kernel, - size_t global_size, - void **args) diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 3c7a9960588..1d1700f036d 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -3,7 +3,6 @@ #ifdef WITH_ONEAPI -/* clang-format off */ # include "kernel.h" # include <iostream> # include <map> @@ -16,163 +15,16 @@ # include "kernel/device/oneapi/kernel_templates.h" # include "kernel/device/gpu/kernel.h" -/* clang-format on */ static OneAPIErrorCallback s_error_cb = nullptr; static void *s_error_user_ptr = nullptr; -static std::vector<sycl::device> oneapi_available_devices(); - void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) { s_error_cb = cb; s_error_user_ptr = user_ptr; } -void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) -{ -# ifdef _DEBUG - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - sycl::info::device_type device_type = - queue->get_device().get_info<sycl::info::device::device_type>(); - sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); - (void)usm_type; - assert(usm_type == sycl::usm::alloc::device || - ((device_type == sycl::info::device_type::host || - device_type == sycl::info::device_type::is_cpu || allow_host) && - usm_type == sycl::usm::alloc::host)); -# endif -} - -bool oneapi_create_queue(SyclQueue *&external_queue, int device_index) -{ - bool finished_correct = true; - try { - std::vector<sycl::device> devices = oneapi_available_devices(); - if (device_index < 0 || device_index >= devices.size()) { - return false; - } - sycl::queue *created_queue = new sycl::queue(devices[device_index], - sycl::property::queue::in_order()); - external_queue = reinterpret_cast<SyclQueue *>(created_queue); - } - catch (sycl::exception const &e) { - finished_correct = false; - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - } - return finished_correct; -} - -void oneapi_free_queue(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - delete queue; -} - -void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::aligned_alloc_host(alignment, memory_size, *queue); -} - -void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::malloc_device(memory_size, *queue); -} - -void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::free(usm_ptr, *queue); -} - -bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, dest, true); - oneapi_check_usm(queue_, src, true); - sycl::event mem_event = queue->memcpy(dest, src, num_bytes); -# ifdef WITH_CYCLES_DEBUG - try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -# else - sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context()); - sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context()); - bool from_device_to_host = dest_type == sycl::usm::alloc::host && - src_type == sycl::usm::alloc::device; - bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown || - src_type == sycl::usm::alloc::unknown; - /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host - * may not wait until the end of the transfer before using the memory. - */ - if (from_device_to_host || host_or_device_memop_with_offset) - mem_event.wait(); - return true; -# endif -} - -bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); -# ifdef WITH_CYCLES_DEBUG - try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -# else - (void)mem_event; - return true; -# endif -} - -bool oneapi_queue_synchronize(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - try { - queue->wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -} - /* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and * also trigger runtime compilation of all existing oneAPI kernels */ bool oneapi_run_test_kernel(SyclQueue *queue_) @@ -216,60 +68,13 @@ bool oneapi_run_test_kernel(SyclQueue *queue_) return true; } -bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) -{ - kernel_global_size = sizeof(KernelGlobalsGPU); - - return true; -} - -void oneapi_set_global_memory(SyclQueue *queue_, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) -{ - assert(queue_); - assert(kernel_globals); - assert(memory_name); - assert(memory_device_pointer); - KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; - oneapi_check_usm(queue_, memory_device_pointer); - oneapi_check_usm(queue_, kernel_globals, true); - - std::string matched_name(memory_name); - -/* This macro will change global ptr of KernelGlobals via name matching. */ -# define KERNEL_DATA_ARRAY(type, name) \ - else if (#name == matched_name) \ - { \ - globals->__##name = (type *)memory_device_pointer; \ - return; \ - } - if (false) { - } - else if ("integrator_state" == matched_name) { - globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; - return; - } - KERNEL_DATA_ARRAY(KernelData, data) -# include "kernel/data_arrays.h" - else - { - std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" - << std::endl; - assert(false); - } -# undef KERNEL_DATA_ARRAY -} - /* TODO: Move device information to OneapiDevice initialized on creation and use it. */ /* TODO: Move below function to oneapi/queue.cpp. */ -size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size) { - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + assert(queue); (void)kernel_global_size; const static size_t preferred_work_group_size_intersect_shading = 32; const static size_t preferred_work_group_size_technical = 1024; @@ -311,8 +116,10 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, preferred_work_group_size = 512; } - const size_t limit_work_group_size = - queue->get_device().get_info<sycl::info::device::max_work_group_size>(); + const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::max_work_group_size>(); + return std::min(limit_work_group_size, preferred_work_group_size); } @@ -664,266 +471,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # endif return success; } - -/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows - * since Windows driver 101.3268. */ -/* The same min compute-runtime version is currently required across Windows and Linux. - * For Windows driver 101.3430, compute-runtime version is 23904. */ -static const int lowest_supported_driver_version_win = 1013430; -static const int lowest_supported_driver_version_neo = 23904; - -static int parse_driver_build_version(const sycl::device &device) -{ - const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); - int driver_build_version = 0; - - size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); - if (second_dot_position == std::string::npos) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - else { - try { - size_t third_dot_position = driver_version.find('.', second_dot_position + 1); - if (third_dot_position != std::string::npos) { - const std::string &third_number_substr = driver_version.substr( - second_dot_position + 1, third_dot_position - second_dot_position - 1); - const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); - if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) - driver_build_version = std::stoi(third_number_substr) * 10000 + - std::stoi(forth_number_substr); - } - else { - const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); - driver_build_version = std::stoi(third_number_substr); - } - } - catch (std::invalid_argument &e) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - } - - return driver_build_version; -} - -static std::vector<sycl::device> oneapi_available_devices() -{ - bool allow_all_devices = false; - if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) - allow_all_devices = true; - - /* Host device is useful only for debugging at the moment - * so we hide this device with default build settings. */ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - bool allow_host = true; -# else - bool allow_host = false; -# endif - - const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); - - std::vector<sycl::device> available_devices; - for (const sycl::platform &platform : oneapi_platforms) { - /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. - */ - if (platform.get_backend() == sycl::backend::opencl) { - continue; - } - - const std::vector<sycl::device> &oneapi_devices = - (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : - platform.get_devices(sycl::info::device_type::gpu); - - for (const sycl::device &device : oneapi_devices) { - if (allow_all_devices) { - /* still filter out host device if build doesn't support it. */ - if (allow_host || !device.is_host()) { - available_devices.push_back(device); - } - } - else { - bool filter_out = false; - - /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, - * assuming they have either more than 96 Execution Units or not 7 threads per EU. - * Official support can be broaden to older and smaller GPUs once ready. */ - if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { - /* Filtered-out defaults in-case these values aren't available through too old L0 - * runtime. */ - int number_of_eus = 96; - int threads_per_eu = 7; - if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); - } - if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - threads_per_eu = - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); - } - /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ - if (number_of_eus <= 96 && threads_per_eu == 7) { - filter_out = true; - } - /* if not already filtered out, check driver version. */ - if (!filter_out) { - int driver_build_version = parse_driver_build_version(device); - if ((driver_build_version > 100000 && - driver_build_version < lowest_supported_driver_version_win) || - driver_build_version < lowest_supported_driver_version_neo) { - filter_out = true; - } - } - } - else if (!allow_host && device.is_host()) { - filter_out = true; - } - else if (!allow_all_devices) { - filter_out = true; - } - - if (!filter_out) { - available_devices.push_back(device); - } - } - } - } - - return available_devices; -} - -char *oneapi_device_capabilities() -{ - std::stringstream capabilities; - - const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices(); - for (const sycl::device &device : oneapi_devices) { - const std::string &name = device.get_info<sycl::info::device::name>(); - - capabilities << std::string("\t") << name << "\n"; -# define WRITE_ATTR(attribute_name, attribute_variable) \ - capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ - << "\n"; -# define GET_NUM_ATTR(attribute) \ - { \ - size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ - capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ - } - - GET_NUM_ATTR(vendor_id) - GET_NUM_ATTR(max_compute_units) - GET_NUM_ATTR(max_work_item_dimensions) - - sycl::id<3> max_work_item_sizes = - device.get_info<sycl::info::device::max_work_item_sizes<3>>(); - WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) - WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) - WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) - - GET_NUM_ATTR(max_work_group_size) - GET_NUM_ATTR(max_num_sub_groups) - GET_NUM_ATTR(sub_group_independent_forward_progress) - - GET_NUM_ATTR(preferred_vector_width_char) - GET_NUM_ATTR(preferred_vector_width_short) - GET_NUM_ATTR(preferred_vector_width_int) - GET_NUM_ATTR(preferred_vector_width_long) - GET_NUM_ATTR(preferred_vector_width_float) - GET_NUM_ATTR(preferred_vector_width_double) - GET_NUM_ATTR(preferred_vector_width_half) - - GET_NUM_ATTR(native_vector_width_char) - GET_NUM_ATTR(native_vector_width_short) - GET_NUM_ATTR(native_vector_width_int) - GET_NUM_ATTR(native_vector_width_long) - GET_NUM_ATTR(native_vector_width_float) - GET_NUM_ATTR(native_vector_width_double) - GET_NUM_ATTR(native_vector_width_half) - - size_t max_clock_frequency = - (size_t)(device.is_host() ? (size_t)0 : - device.get_info<sycl::info::device::max_clock_frequency>()); - WRITE_ATTR("max_clock_frequency", max_clock_frequency) - - GET_NUM_ATTR(address_bits) - GET_NUM_ATTR(max_mem_alloc_size) - - /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't - * supported so we always return false, even if device supports HW texture usage acceleration. - */ - bool image_support = false; - WRITE_ATTR("image_support", (size_t)image_support) - - GET_NUM_ATTR(max_parameter_size) - GET_NUM_ATTR(mem_base_addr_align) - GET_NUM_ATTR(global_mem_size) - GET_NUM_ATTR(local_mem_size) - GET_NUM_ATTR(error_correction_support) - GET_NUM_ATTR(profiling_timer_resolution) - GET_NUM_ATTR(is_available) - -# undef GET_NUM_ATTR -# undef WRITE_ATTR - capabilities << "\n"; - } - - return ::strdup(capabilities.str().c_str()); -} - -void oneapi_free(void *p) -{ - if (p) { - ::free(p); - } -} - -void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) -{ - int num = 0; - std::vector<sycl::device> devices = oneapi_available_devices(); - for (sycl::device &device : devices) { - const std::string &platform_name = - device.get_platform().get_info<sycl::info::platform::name>(); - std::string name = device.get_info<sycl::info::device::name>(); - std::string id = "ONEAPI_" + platform_name + "_" + name; - if (device.has(sycl::aspect::ext_intel_pci_address)) { - id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); - } - (cb)(id.c_str(), name.c_str(), num, user_ptr); - num++; - } -} - -size_t oneapi_get_memcapacity(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::global_mem_size>(); -} - -int oneapi_get_num_multiprocessors(SyclQueue *queue) -{ - const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device(); - if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); - } - else - return 0; -} - -int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue) -{ - const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device(); - if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) && - device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() * - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); - } - else - return 0; -} - #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h index c5f853742ed..7456d0e4902 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.h +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -25,11 +25,6 @@ enum DeviceKernel : int; class SyclQueue; -typedef void (*OneAPIDeviceIteratorCallback)(const char *id, - const char *name, - int num, - void *user_ptr); - typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr); struct KernelContext { @@ -45,13 +40,15 @@ struct KernelContext { extern "C" { # endif -# define DLL_INTERFACE_CALL(function, return_type, ...) \ - CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__); -# include "kernel/device/oneapi/dll_interface_template.h" -# undef DLL_INTERFACE_CALL - +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_); +CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr); +CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size( + SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size); +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context, + int kernel, + size_t global_size, + void **args); # ifdef __cplusplus } # endif - #endif /* WITH_ONEAPI */ |