diff options
Diffstat (limited to 'intern/cycles/kernel/device/oneapi/kernel.cpp')
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.cpp | 566 |
1 files changed, 94 insertions, 472 deletions
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 300e201600c..525ae288f0c 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -3,208 +3,79 @@ #ifdef WITH_ONEAPI -/* clang-format off */ # include "kernel.h" # include <iostream> # include <map> # include <set> -# include <CL/sycl.hpp> +# include <sycl/sycl.hpp> # include "kernel/device/oneapi/compat.h" # include "kernel/device/oneapi/globals.h" # include "kernel/device/oneapi/kernel_templates.h" # include "kernel/device/gpu/kernel.h" -/* clang-format on */ static OneAPIErrorCallback s_error_cb = nullptr; static void *s_error_user_ptr = nullptr; -static std::vector<sycl::device> oneapi_available_devices(); - void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) { s_error_cb = cb; s_error_user_ptr = user_ptr; } -void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) -{ -# ifdef _DEBUG - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - sycl::info::device_type device_type = - queue->get_device().get_info<sycl::info::device::device_type>(); - sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); - (void)usm_type; - assert(usm_type == sycl::usm::alloc::device || - ((device_type == sycl::info::device_type::host || - device_type == sycl::info::device_type::is_cpu || allow_host) && - usm_type == sycl::usm::alloc::host)); -# endif -} - -bool oneapi_create_queue(SyclQueue *&external_queue, int device_index) -{ - bool finished_correct = true; - try { - std::vector<sycl::device> devices = oneapi_available_devices(); - if (device_index < 0 || device_index >= devices.size()) { - return false; - } - sycl::queue *created_queue = new sycl::queue(devices[device_index], - sycl::property::queue::in_order()); - external_queue = reinterpret_cast<SyclQueue *>(created_queue); - } - catch (sycl::exception const &e) { - finished_correct = false; - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - } - return finished_correct; -} - -void oneapi_free_queue(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - delete queue; -} - -void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::aligned_alloc_host(alignment, memory_size, *queue); -} - -void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::malloc_device(memory_size, *queue); -} - -void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr) +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like + * memory allocations, memory transfers and execution of kernel with USM memory. */ +bool oneapi_run_test_kernel(SyclQueue *queue_) { assert(queue_); sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::free(usm_ptr, *queue); -} + const size_t N = 8; + const size_t memory_byte_size = sizeof(int) * N; -bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, dest, true); - oneapi_check_usm(queue_, src, true); - sycl::event mem_event = queue->memcpy(dest, src, num_bytes); -# ifdef WITH_CYCLES_DEBUG + bool is_computation_correct = true; try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -# else - sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context()); - sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context()); - bool from_device_to_host = dest_type == sycl::usm::alloc::host && - src_type == sycl::usm::alloc::device; - bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown || - src_type == sycl::usm::alloc::unknown; - /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host - * may not wait until the end of the transfer before using the memory. - */ - if (from_device_to_host || host_or_device_memop_with_offset) - mem_event.wait(); - return true; -# endif -} + int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); -bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); -# ifdef WITH_CYCLES_DEBUG - try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); + for (size_t i = (size_t)0; i < N; i++) { + A_host[i] = rand() % 32; } - return false; - } -# else - (void)mem_event; - return true; -# endif -} -bool oneapi_queue_synchronize(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - try { - queue->wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -} + int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue); + int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue); -/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and - * also trigger runtime compilation of all existing oneAPI kernels */ -bool oneapi_run_test_kernel(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - size_t N = 8; - sycl::buffer<float, 1> A(N); - sycl::buffer<float, 1> B(N); - - { - sycl::host_accessor A_host_acc(A, sycl::write_only); - for (size_t i = (size_t)0; i < N; i++) - A_host_acc[i] = rand() % 32; - } + queue->memcpy(A_device, A_host, memory_byte_size); + queue->wait_and_throw(); - try { queue->submit([&](sycl::handler &cgh) { - sycl::accessor A_acc(A, cgh, sycl::read_only); - sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); - - cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); }); }); queue->wait_and_throw(); - sycl::host_accessor A_host_acc(A, sycl::read_only); - sycl::host_accessor B_host_acc(B, sycl::read_only); + int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); + + queue->memcpy(B_host, B_device, memory_byte_size); + queue->wait_and_throw(); for (size_t i = (size_t)0; i < N; i++) { - float result = A_host_acc[i] + B_host_acc[i]; - (void)result; + const int expected_result = i + A_host[i]; + if (B_host[i] != expected_result) { + is_computation_correct = false; + if (s_error_cb) { + s_error_cb(("Incorrect result in test kernel execution - expected " + + std::to_string(expected_result) + ", got " + std::to_string(B_host[i])) + .c_str(), + s_error_user_ptr); + } + } } + + sycl::free(A_host, *queue); + sycl::free(B_host, *queue); + sycl::free(A_device, *queue); + sycl::free(B_device, *queue); + queue->wait_and_throw(); } catch (sycl::exception const &e) { if (s_error_cb) { @@ -213,63 +84,16 @@ bool oneapi_run_test_kernel(SyclQueue *queue_) return false; } - return true; -} - -bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) -{ - kernel_global_size = sizeof(KernelGlobalsGPU); - - return true; -} - -void oneapi_set_global_memory(SyclQueue *queue_, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) -{ - assert(queue_); - assert(kernel_globals); - assert(memory_name); - assert(memory_device_pointer); - KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; - oneapi_check_usm(queue_, memory_device_pointer); - oneapi_check_usm(queue_, kernel_globals, true); - - std::string matched_name(memory_name); - -/* This macro will change global ptr of KernelGlobals via name matching. */ -# define KERNEL_DATA_ARRAY(type, name) \ - else if (#name == matched_name) \ - { \ - globals->__##name = (type *)memory_device_pointer; \ - return; \ - } - if (false) { - } - else if ("integrator_state" == matched_name) { - globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; - return; - } - KERNEL_DATA_ARRAY(KernelData, data) -# include "kernel/data_arrays.h" - else - { - std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" - << std::endl; - assert(false); - } -# undef KERNEL_DATA_ARRAY + return is_computation_correct; } /* TODO: Move device information to OneapiDevice initialized on creation and use it. */ /* TODO: Move below function to oneapi/queue.cpp. */ -size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size) { - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + assert(queue); (void)kernel_global_size; const static size_t preferred_work_group_size_intersect_shading = 32; const static size_t preferred_work_group_size_technical = 1024; @@ -311,11 +135,63 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, preferred_work_group_size = 512; } - const size_t limit_work_group_size = - queue->get_device().get_info<sycl::info::device::max_work_group_size>(); + const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::max_work_group_size>(); + return std::min(limit_work_group_size, preferred_work_group_size); } +bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) +{ +# ifdef SYCL_SKIP_KERNELS_PRELOAD + (void)queue_; + (void)requested_features; +# else + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + + try { + sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), + {queue->get_device()}); + + for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { + const std::string &kernel_name = kernel_id.get_name(); + + /* NOTE(@nsirgien): Names in this conditions below should match names from + * oneapi_call macro in oneapi_enqueue_kernel below */ + if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_MNEE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") != + std::string::npos) { + continue; + } + + sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id}); + sycl::build(one_kernel_bundle); + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +# endif + return true; +} + bool oneapi_enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, @@ -354,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices, * we extend work size to fit uniformity requirements. */ global_size = groups_count * local_size; - -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - if (queue->get_device().is_host()) { - global_size = 1; - local_size = 1; - } -# endif } /* Let the compiler throw an error if there are any kernels missing in this implementation. */ @@ -645,13 +514,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* Unsupported kernels */ case DEVICE_KERNEL_NUM: case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: - assert(0); - return false; + kernel_assert(0); + break; } - - /* Unknown kernel. */ - assert(0); - return false; }); } catch (sycl::exception const &e) { @@ -668,247 +533,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # endif return success; } - -static const int lowest_supported_driver_version_win = 1011660; -static const int lowest_supported_driver_version_neo = 23570; - -static int parse_driver_build_version(const sycl::device &device) -{ - const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); - int driver_build_version = 0; - - size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); - if (second_dot_position == std::string::npos) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - else { - try { - size_t third_dot_position = driver_version.find('.', second_dot_position + 1); - if (third_dot_position != std::string::npos) { - const std::string &third_number_substr = driver_version.substr( - second_dot_position + 1, third_dot_position - second_dot_position - 1); - const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); - if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) - driver_build_version = std::stoi(third_number_substr) * 10000 + - std::stoi(forth_number_substr); - } - else { - const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); - driver_build_version = std::stoi(third_number_substr); - } - } - catch (std::invalid_argument &e) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - } - - return driver_build_version; -} - -static std::vector<sycl::device> oneapi_available_devices() -{ - bool allow_all_devices = false; - if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) - allow_all_devices = true; - - /* Host device is useful only for debugging at the moment - * so we hide this device with default build settings. */ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - bool allow_host = true; -# else - bool allow_host = false; -# endif - - const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); - - std::vector<sycl::device> available_devices; - for (const sycl::platform &platform : oneapi_platforms) { - /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. - */ - if (platform.get_backend() == sycl::backend::opencl) { - continue; - } - - const std::vector<sycl::device> &oneapi_devices = - (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : - platform.get_devices(sycl::info::device_type::gpu); - - for (const sycl::device &device : oneapi_devices) { - if (allow_all_devices) { - /* still filter out host device if build doesn't support it. */ - if (allow_host || !device.is_host()) { - available_devices.push_back(device); - } - } - else { - bool filter_out = false; - - /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, - * assuming they have either more than 96 Execution Units or not 7 threads per EU. - * Official support can be broaden to older and smaller GPUs once ready. */ - if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { - /* Filtered-out defaults in-case these values aren't available through too old L0 - * runtime. */ - int number_of_eus = 96; - int threads_per_eu = 7; - if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); - } - if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - threads_per_eu = - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); - } - /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ - if (number_of_eus <= 96 && threads_per_eu == 7) { - filter_out = true; - } - /* if not already filtered out, check driver version. */ - if (!filter_out) { - int driver_build_version = parse_driver_build_version(device); - if ((driver_build_version > 100000 && - driver_build_version < lowest_supported_driver_version_win) || - (driver_build_version > 0 && - driver_build_version < lowest_supported_driver_version_neo)) { - filter_out = true; - } - } - } - else if (!allow_host && device.is_host()) { - filter_out = true; - } - else if (!allow_all_devices) { - filter_out = true; - } - - if (!filter_out) { - available_devices.push_back(device); - } - } - } - } - - return available_devices; -} - -char *oneapi_device_capabilities() -{ - std::stringstream capabilities; - - const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices(); - for (const sycl::device &device : oneapi_devices) { - const std::string &name = device.get_info<sycl::info::device::name>(); - - capabilities << std::string("\t") << name << "\n"; -# define WRITE_ATTR(attribute_name, attribute_variable) \ - capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ - << "\n"; -# define GET_NUM_ATTR(attribute) \ - { \ - size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ - capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ - } - - GET_NUM_ATTR(vendor_id) - GET_NUM_ATTR(max_compute_units) - GET_NUM_ATTR(max_work_item_dimensions) - - sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>(); - WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) - WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) - WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) - - GET_NUM_ATTR(max_work_group_size) - GET_NUM_ATTR(max_num_sub_groups) - GET_NUM_ATTR(sub_group_independent_forward_progress) - - GET_NUM_ATTR(preferred_vector_width_char) - GET_NUM_ATTR(preferred_vector_width_short) - GET_NUM_ATTR(preferred_vector_width_int) - GET_NUM_ATTR(preferred_vector_width_long) - GET_NUM_ATTR(preferred_vector_width_float) - GET_NUM_ATTR(preferred_vector_width_double) - GET_NUM_ATTR(preferred_vector_width_half) - - GET_NUM_ATTR(native_vector_width_char) - GET_NUM_ATTR(native_vector_width_short) - GET_NUM_ATTR(native_vector_width_int) - GET_NUM_ATTR(native_vector_width_long) - GET_NUM_ATTR(native_vector_width_float) - GET_NUM_ATTR(native_vector_width_double) - GET_NUM_ATTR(native_vector_width_half) - - size_t max_clock_frequency = - (size_t)(device.is_host() ? (size_t)0 : - device.get_info<sycl::info::device::max_clock_frequency>()); - WRITE_ATTR("max_clock_frequency", max_clock_frequency) - - GET_NUM_ATTR(address_bits) - GET_NUM_ATTR(max_mem_alloc_size) - - /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't - * supported so we always return false, even if device supports HW texture usage acceleration. - */ - bool image_support = false; - WRITE_ATTR("image_support", (size_t)image_support) - - GET_NUM_ATTR(max_parameter_size) - GET_NUM_ATTR(mem_base_addr_align) - GET_NUM_ATTR(global_mem_size) - GET_NUM_ATTR(local_mem_size) - GET_NUM_ATTR(error_correction_support) - GET_NUM_ATTR(profiling_timer_resolution) - GET_NUM_ATTR(is_available) - -# undef GET_NUM_ATTR -# undef WRITE_ATTR - capabilities << "\n"; - } - - return ::strdup(capabilities.str().c_str()); -} - -void oneapi_free(void *p) -{ - if (p) { - ::free(p); - } -} - -void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) -{ - int num = 0; - std::vector<sycl::device> devices = oneapi_available_devices(); - for (sycl::device &device : devices) { - const std::string &platform_name = - device.get_platform().get_info<sycl::info::platform::name>(); - std::string name = device.get_info<sycl::info::device::name>(); - std::string id = "ONEAPI_" + platform_name + "_" + name; - if (device.has(sycl::aspect::ext_intel_pci_address)) { - id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); - } - (cb)(id.c_str(), name.c_str(), num, user_ptr); - num++; - } -} - -size_t oneapi_get_memcapacity(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::global_mem_size>(); -} - -size_t oneapi_get_compute_units_amount(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::max_compute_units>(); -} - #endif /* WITH_ONEAPI */ |