1 files changed, 472 insertions, 53 deletions
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index dd0622a5bd5..2df605fa047 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -8,7 +8,7 @@
 #  include "util/debug.h"
 #  include "util/log.h"
 
-#  include "kernel/device/oneapi/kernel.h"
+#  include "kernel/device/oneapi/globals.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -19,26 +19,19 @@ static void queue_error_cb(const char *message, void *user_ptr)
   }
 }
 
-OneapiDevice::OneapiDevice(const DeviceInfo &info,
-                           OneAPIDLLInterface &oneapi_dll_object,
-                           Stats &stats,
-                           Profiler &profiler)
+OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     : Device(info, stats, profiler),
       device_queue_(nullptr),
       texture_info_(this, "texture_info", MEM_GLOBAL),
       kg_memory_(nullptr),
       kg_memory_device_(nullptr),
-      kg_memory_size_(0),
-      oneapi_dll_(oneapi_dll_object)
+      kg_memory_size_(0)
 {
   need_texture_info_ = false;
 
-  oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
+  oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
 
-  /* OneAPI calls should be initialized on this moment. */
-  assert(oneapi_dll_.oneapi_create_queue != nullptr);
-
-  bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
+  bool is_finished_ok = create_queue(device_queue_, info.num);
   if (is_finished_ok == false) {
     set_error("oneAPI queue initialization error: got runtime exception \"" +
               oneapi_error_string_ + "\"");
@@ -50,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
   }
 
   size_t globals_segment_size;
-  is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size);
+  is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size);
   if (is_finished_ok == false) {
     set_error("oneAPI constant memory initialization got runtime exception \"" +
               oneapi_error_string_ + "\"");
@@ -59,27 +52,27 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
     VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)";
   }
 
-  kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
-  oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
+  kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
+  usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
 
-  kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size);
+  kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);
 
   kg_memory_size_ = globals_segment_size;
 
-  max_memory_on_device_ = oneapi_dll_.oneapi_get_memcapacity(device_queue_);
+  max_memory_on_device_ = get_memcapacity();
 }
 
 OneapiDevice::~OneapiDevice()
 {
   texture_info_.free();
-  oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_);
-  oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_);
+  usm_free(device_queue_, kg_memory_);
+  usm_free(device_queue_, kg_memory_device_);
 
   for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++)
     delete mt->second;
 
   if (device_queue_)
-    oneapi_dll_.oneapi_free_queue(device_queue_);
+    free_queue(device_queue_);
 }
 
 bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
@@ -99,7 +92,7 @@ bool OneapiDevice::load_kernels(const uint requested_features)
    * with specialization constants, but it hasn't been implemented yet. */
   (void)requested_features;
 
-  bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_);
+  bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
   if (is_finished_ok == false) {
     set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
   }
@@ -138,7 +131,7 @@ void OneapiDevice::generic_alloc(device_memory &mem)
    * type has been used for oneAPI device in order to better fit in Cycles architecture. */
   void *device_pointer = nullptr;
   if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
-    device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size);
+    device_pointer = usm_alloc_device(device_queue_, memory_size);
   if (device_pointer == nullptr) {
     set_error("oneAPI kernel - device memory allocation error for " +
               string_human_readable_size(mem.memory_size()) +
@@ -163,8 +156,7 @@ void OneapiDevice::generic_copy_to(device_memory &mem)
   /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
   assert(mem.host_pointer);
   assert(device_queue_);
-  oneapi_dll_.oneapi_usm_memcpy(
-      device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
+  usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
 }
 
 /* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
@@ -178,11 +170,6 @@ string OneapiDevice::oneapi_error_message()
   return string(oneapi_error_string_);
 }
 
-OneAPIDLLInterface OneapiDevice::oneapi_dll_object()
-{
-  return oneapi_dll_;
-}
-
 void *OneapiDevice::kernel_globals_device_pointer()
 {
   return kg_memory_device_;
@@ -198,7 +185,7 @@ void OneapiDevice::generic_free(device_memory &mem)
   mem.device_size = 0;
 
   assert(device_queue_);
-  oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer);
+  usm_free(device_queue_, (void *)mem.device_pointer);
   mem.device_pointer = 0;
 }
 
@@ -266,8 +253,7 @@ void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t
     if (mem.device_pointer) {
       char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset;
       char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset;
-      bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy(
-          device_queue_, shifted_host, shifted_device, size);
+      bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device, size);
       if (is_finished_ok == false) {
         set_error("oneAPI memory operation error: got runtime exception \"" +
                   oneapi_error_string_ + "\"");
@@ -292,7 +278,7 @@ void OneapiDevice::mem_zero(device_memory &mem)
   }
 
   assert(device_queue_);
-  bool is_finished_ok = oneapi_dll_.oneapi_usm_memset(
+  bool is_finished_ok = usm_memset(
       device_queue_, (void *)mem.device_pointer, 0, mem.memory_size());
   if (is_finished_ok == false) {
     set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
@@ -349,10 +335,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
   memcpy(data->data(), host, size);
   data->copy_to_device();
 
-  oneapi_dll_.oneapi_set_global_memory(
-      device_queue_, kg_memory_, name, (void *)data->device_pointer);
+  set_global_memory(device_queue_, kg_memory_, name, (void *)data->device_pointer);
 
-  oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
 }
 
 void OneapiDevice::global_alloc(device_memory &mem)
@@ -367,10 +352,9 @@ void OneapiDevice::global_alloc(device_memory &mem)
   generic_alloc(mem);
   generic_copy_to(mem);
 
-  oneapi_dll_.oneapi_set_global_memory(
-      device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
+  set_global_memory(device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer);
 
-  oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
+  usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
 }
 
 void OneapiDevice::global_free(device_memory &mem)
@@ -410,18 +394,6 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
   return make_unique<OneapiDeviceQueue>(this);
 }
 
-int OneapiDevice::get_num_multiprocessors()
-{
-  assert(device_queue_);
-  return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
-}
-
-int OneapiDevice::get_max_num_threads_per_multiprocessor()
-{
-  assert(device_queue_);
-  return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
-}
-
 bool OneapiDevice::should_use_graphics_interop()
 {
   /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
@@ -432,13 +404,460 @@ bool OneapiDevice::should_use_graphics_interop()
 void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment)
 {
   assert(device_queue_);
-  return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment);
+  return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
 }
 
 void OneapiDevice::usm_free(void *usm_ptr)
 {
   assert(device_queue_);
-  return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr);
+  return usm_free(device_queue_, usm_ptr);
+}
+
+void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
+{
+#  ifdef _DEBUG
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  sycl::info::device_type device_type =
+      queue->get_device().get_info<sycl::info::device::device_type>();
+  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
+  (void)usm_type;
+  assert(usm_type == sycl::usm::alloc::device ||
+         ((device_type == sycl::info::device_type::host ||
+           device_type == sycl::info::device_type::cpu || allow_host) &&
+          usm_type == sycl::usm::alloc::host));
+#  endif
+}
+
+bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
+{
+  bool finished_correct = true;
+  try {
+    std::vector<sycl::device> devices = OneapiDevice::available_devices();
+    if (device_index < 0 || device_index >= devices.size()) {
+      return false;
+    }
+    sycl::queue *created_queue = new sycl::queue(devices[device_index],
+                                                 sycl::property::queue::in_order());
+    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
+  }
+  catch (sycl::exception const &e) {
+    finished_correct = false;
+    oneapi_error_string_ = e.what();
+  }
+  return finished_correct;
+}
+
+void OneapiDevice::free_queue(SyclQueue *queue_)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  delete queue;
+}
+
+void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
+}
+
+void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::malloc_device(memory_size, *queue);
+}
+
+void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  OneapiDevice::check_usm(queue_, usm_ptr, true);
+  sycl::free(usm_ptr, *queue);
+}
+
+bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  OneapiDevice::check_usm(queue_, dest, true);
+  OneapiDevice::check_usm(queue_, src, true);
+  sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
+#  ifdef WITH_CYCLES_DEBUG
+  try {
+    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
+     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
+     */
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    oneapi_error_string_ = e.what();
+    return false;
+  }
+#  else
+  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
+  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
+  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
+                             src_type == sycl::usm::alloc::device;
+  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
+                                          src_type == sycl::usm::alloc::unknown;
+  /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
+   * may not wait until the end of the transfer before using the memory.
+   */
+  if (from_device_to_host || host_or_device_memop_with_offset)
+    mem_event.wait();
+  return true;
+#  endif
+}
+
+bool OneapiDevice::usm_memset(SyclQueue *queue_,
+                              void *usm_ptr,
+                              unsigned char value,
+                              size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  OneapiDevice::check_usm(queue_, usm_ptr, true);
+  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
+#  ifdef WITH_CYCLES_DEBUG
+  try {
+    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
+     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
+     */
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    oneapi_error_string_ = e.what();
+    return false;
+  }
+#  else
+  (void)mem_event;
+  return true;
+#  endif
+}
+
+bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  try {
+    queue->wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    oneapi_error_string_ = e.what();
+    return false;
+  }
+}
+
+bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
+{
+  kernel_global_size = sizeof(KernelGlobalsGPU);
+
+  return true;
+}
+
+void OneapiDevice::set_global_memory(SyclQueue *queue_,
+                                     void *kernel_globals,
+                                     const char *memory_name,
+                                     void *memory_device_pointer)
+{
+  assert(queue_);
+  assert(kernel_globals);
+  assert(memory_name);
+  assert(memory_device_pointer);
+  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
+  OneapiDevice::check_usm(queue_, memory_device_pointer);
+  OneapiDevice::check_usm(queue_, kernel_globals, true);
+
+  std::string matched_name(memory_name);
+
+/* This macro will change global ptr of KernelGlobals via name matching. */
+#  define KERNEL_DATA_ARRAY(type, name) \
+    else if (#name == matched_name) \
+    { \
+      globals->__##name = (type *)memory_device_pointer; \
+      return; \
+    }
+  if (false) {
+  }
+  else if ("integrator_state" == matched_name) {
+    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
+    return;
+  }
+  KERNEL_DATA_ARRAY(KernelData, data)
+#  include "kernel/data_arrays.h"
+  else
+  {
+    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
+              << std::endl;
+    assert(false);
+  }
+#  undef KERNEL_DATA_ARRAY
+}
+
+bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
+                                  int kernel,
+                                  size_t global_size,
+                                  void **args)
+{
+  return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
+}
+
+/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
+ * since Windows driver 101.3268. */
+/* The same min compute-runtime version is currently required across Windows and Linux.
+ * For Windows driver 101.3430, compute-runtime version is 23904. */
+static const int lowest_supported_driver_version_win = 1013430;
+static const int lowest_supported_driver_version_neo = 23904;
+
+int OneapiDevice::parse_driver_build_version(const sycl::device &device)
+{
+  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
+  int driver_build_version = 0;
+
+  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
+  if (second_dot_position == std::string::npos) {
+    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+              << " xx.xx.xxx.xxxx (Windows) for device \""
+              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+  }
+  else {
+    try {
+      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
+      if (third_dot_position != std::string::npos) {
+        const std::string &third_number_substr = driver_version.substr(
+            second_dot_position + 1, third_dot_position - second_dot_position - 1);
+        const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
+        if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
+          driver_build_version = std::stoi(third_number_substr) * 10000 +
+                                 std::stoi(forth_number_substr);
+      }
+      else {
+        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
+        driver_build_version = std::stoi(third_number_substr);
+      }
+    }
+    catch (std::invalid_argument &) {
+      std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+                << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+                << " xx.xx.xxx.xxxx (Windows) for device \""
+                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+    }
+  }
+
+  return driver_build_version;
+}
+
+std::vector<sycl::device> OneapiDevice::available_devices()
+{
+  bool allow_all_devices = false;
+  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
+    allow_all_devices = true;
+
+    /* Host device is useful only for debugging at the moment
+     * so we hide this device with default build settings. */
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+  bool allow_host = true;
+#  else
+  bool allow_host = false;
+#  endif
+
+  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
+
+  std::vector<sycl::device> available_devices;
+  for (const sycl::platform &platform : oneapi_platforms) {
+    /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
+     */
+    if (platform.get_backend() == sycl::backend::opencl) {
+      continue;
+    }
+
+    const std::vector<sycl::device> &oneapi_devices =
+        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
+                                            platform.get_devices(sycl::info::device_type::gpu);
+
+    for (const sycl::device &device : oneapi_devices) {
+      if (allow_all_devices) {
+        /* still filter out host device if build doesn't support it. */
+        if (allow_host || !device.is_host()) {
+          available_devices.push_back(device);
+        }
+      }
+      else {
+        bool filter_out = false;
+
+        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
+         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
+         * Official support can be broaden to older and smaller GPUs once ready. */
+        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
+          /* Filtered-out defaults in-case these values aren't available through too old L0
+           * runtime. */
+          int number_of_eus = 96;
+          int threads_per_eu = 7;
+          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+          }
+          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+            threads_per_eu =
+                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+          }
+          /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
+          if (number_of_eus <= 96 && threads_per_eu == 7) {
+            filter_out = true;
+          }
+          /* if not already filtered out, check driver version. */
+          if (!filter_out) {
+            int driver_build_version = parse_driver_build_version(device);
+            if ((driver_build_version > 100000 &&
+                 driver_build_version < lowest_supported_driver_version_win) ||
+                driver_build_version < lowest_supported_driver_version_neo) {
+              filter_out = true;
+            }
+          }
+        }
+        else if (!allow_host && device.is_host()) {
+          filter_out = true;
+        }
+        else if (!allow_all_devices) {
+          filter_out = true;
+        }
+
+        if (!filter_out) {
+          available_devices.push_back(device);
+        }
+      }
+    }
+  }
+
+  return available_devices;
+}
+
+char *OneapiDevice::device_capabilities()
+{
+  std::stringstream capabilities;
+
+  const std::vector<sycl::device> &oneapi_devices = available_devices();
+  for (const sycl::device &device : oneapi_devices) {
+    const std::string &name = device.get_info<sycl::info::device::name>();
+
+    capabilities << std::string("\t") << name << "\n";
+#  define WRITE_ATTR(attribute_name, attribute_variable) \
+    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
+                 << "\n";
+#  define GET_NUM_ATTR(attribute) \
+    { \
+      size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
+      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
+    }
+
+    GET_NUM_ATTR(vendor_id)
+    GET_NUM_ATTR(max_compute_units)
+    GET_NUM_ATTR(max_work_item_dimensions)
+
+    sycl::id<3> max_work_item_sizes =
+        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
+    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
+    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
+
+    GET_NUM_ATTR(max_work_group_size)
+    GET_NUM_ATTR(max_num_sub_groups)
+    GET_NUM_ATTR(sub_group_independent_forward_progress)
+
+    GET_NUM_ATTR(preferred_vector_width_char)
+    GET_NUM_ATTR(preferred_vector_width_short)
+    GET_NUM_ATTR(preferred_vector_width_int)
+    GET_NUM_ATTR(preferred_vector_width_long)
+    GET_NUM_ATTR(preferred_vector_width_float)
+    GET_NUM_ATTR(preferred_vector_width_double)
+    GET_NUM_ATTR(preferred_vector_width_half)
+
+    GET_NUM_ATTR(native_vector_width_char)
+    GET_NUM_ATTR(native_vector_width_short)
+    GET_NUM_ATTR(native_vector_width_int)
+    GET_NUM_ATTR(native_vector_width_long)
+    GET_NUM_ATTR(native_vector_width_float)
+    GET_NUM_ATTR(native_vector_width_double)
+    GET_NUM_ATTR(native_vector_width_half)
+
+    size_t max_clock_frequency =
+        (size_t)(device.is_host() ? (size_t)0 :
+                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    WRITE_ATTR("max_clock_frequency", max_clock_frequency)
+
+    GET_NUM_ATTR(address_bits)
+    GET_NUM_ATTR(max_mem_alloc_size)
+
+    /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
+     * supported so we always return false, even if device supports HW texture usage acceleration.
+     */
+    bool image_support = false;
+    WRITE_ATTR("image_support", (size_t)image_support)
+
+    GET_NUM_ATTR(max_parameter_size)
+    GET_NUM_ATTR(mem_base_addr_align)
+    GET_NUM_ATTR(global_mem_size)
+    GET_NUM_ATTR(local_mem_size)
+    GET_NUM_ATTR(error_correction_support)
+    GET_NUM_ATTR(profiling_timer_resolution)
+    GET_NUM_ATTR(is_available)
+
+#  undef GET_NUM_ATTR
+#  undef WRITE_ATTR
+    capabilities << "\n";
+  }
+
+  return ::strdup(capabilities.str().c_str());
+}
+
+void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
+{
+  int num = 0;
+  std::vector<sycl::device> devices = OneapiDevice::available_devices();
+  for (sycl::device &device : devices) {
+    const std::string &platform_name =
+        device.get_platform().get_info<sycl::info::platform::name>();
+    std::string name = device.get_info<sycl::info::device::name>();
+    std::string id = "ONEAPI_" + platform_name + "_" + name;
+    if (device.has(sycl::aspect::ext_intel_pci_address)) {
+      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+    }
+    (cb)(id.c_str(), name.c_str(), num, user_ptr);
+    num++;
+  }
+}
+
+size_t OneapiDevice::get_memcapacity()
+{
+  return reinterpret_cast<sycl::queue *>(device_queue_)
+      ->get_device()
+      .get_info<sycl::info::device::global_mem_size>();
+}
+
+int OneapiDevice::get_num_multiprocessors()
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
+  if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+  }
+  else
+    return 0;
+}
+
+int OneapiDevice::get_max_num_threads_per_multiprocessor()
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
+  if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
+      device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
+           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+  }
+  else
+    return 0;
 }
 
 CCL_NAMESPACE_END