From 305b92e05f748a0fd9cb62b9829791d717ba2d57 Mon Sep 17 00:00:00 2001
From: Xavier Hallade
Date: Fri, 21 Oct 2022 14:10:25 +0200
Subject: Cycles: oneAPI: remove use of SYCL host device

Host device is deprecated in SYCL 2020 spec, cpu device or standard C++
should be used instead.
---
 CMakeLists.txt                                |  2 -
 intern/cycles/device/oneapi/device.cpp        |  4 +-
 intern/cycles/device/oneapi/device_impl.cpp   | 30 +++------------
 intern/cycles/kernel/CMakeLists.txt           |  4 --
 .../kernel/device/gpu/parallel_active_index.h | 33 +++-------------
 intern/cycles/kernel/device/oneapi/compat.h   | 45 +++++-----------------
 intern/cycles/kernel/device/oneapi/globals.h  |  9 -----
 intern/cycles/kernel/device/oneapi/kernel.cpp |  7 ----
 8 files changed, 21 insertions(+), 113 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9134c7c1ed6..a3ea162d040 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -489,14 +489,12 @@ endif()
 if(NOT APPLE)
   option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
   option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
-  option(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED "Enable use of SYCL host (CPU) device execution by oneAPI implementation. This option is for debugging purposes and impacts GPU execution." OFF)
 
   # https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
   # acm-g10 is the architecture for the first Arc Alchemist GPUs but we'll keep using dg2 until IGC dependency is updated to support acm-g10.
   set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "dg2" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
   set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
 
-  mark_as_advanced(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
   mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
   mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
 endif()
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index f303ab41627..66d6f749e30 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -39,7 +39,7 @@ bool device_oneapi_init()
     _putenv_s("SYCL_CACHE_THRESHOLD", "0");
   }
   if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-    _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+    _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
   }
   if (getenv("SYCL_ENABLE_PCI") == nullptr) {
     _putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +50,7 @@ bool device_oneapi_init()
 #  elif __linux__
   setenv("SYCL_CACHE_PERSISTENT", "1", false);
   setenv("SYCL_CACHE_THRESHOLD", "0", false);
-  setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+  setenv("SYCL_DEVICE_FILTER", "level_zero", false);
   setenv("SYCL_ENABLE_PCI", "1", false);
   setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
 #  endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index f14eada071d..4e7849e6b9a 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -430,8 +430,7 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
   sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
   (void)usm_type;
   assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::cpu || allow_host) &&
+         ((device_type == sycl::info::device_type::cpu || allow_host) &&
           usm_type == sycl::usm::alloc::host ||
           usm_type == sycl::usm::alloc::unknown));
 #  else
@@ -672,14 +671,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
   if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
     allow_all_devices = true;
 
-  /* Host device is useful only for debugging at the moment
-   * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
-
   const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
 
   std::vector<sycl::device> available_devices;
@@ -691,17 +682,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
     }
 
     const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
+        (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
+                              platform.get_devices(sycl::info::device_type::gpu);
 
     for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
+      if (!allow_all_devices) {
         bool filter_out = false;
 
         /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
@@ -733,9 +718,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
           }
         }
       }
-      else if (!allow_host && device.is_host()) {
-        filter_out = true;
-      }
       else if (!allow_all_devices) {
         filter_out = true;
       }
@@ -798,9 +780,7 @@ char *OneapiDevice::device_capabilities()
     GET_NUM_ATTR(native_vector_width_double)
     GET_NUM_ATTR(native_vector_width_half)
 
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
     WRITE_ATTR("max_clock_frequency", max_clock_frequency)
 
     GET_NUM_ATTR(address_bits)
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 1e69d14b1b7..b6a53117a3b 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -752,10 +752,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
     ${SYCL_CPP_FLAGS}
   )
 
-  if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
-    list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
-  endif()
-
   # Set defaults for spir64 and spir64_gen options
   if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
     set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index c1df49c4f49..38cdcb572eb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
  * and keep device specific code in compat.h */
 
 #ifdef __KERNEL_ONEAPI__
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-template<typename IsActiveOp>
-void cpu_serial_active_index_array_impl(const uint num_states,
-                                        ccl_global int *ccl_restrict indices,
-                                        ccl_global int *ccl_restrict num_indices,
-                                        IsActiveOp is_active_op)
-{
-  int write_index = 0;
-  for (int state_index = 0; state_index < num_states; state_index++) {
-    if (is_active_op(state_index))
-      indices[write_index++] = state_index;
-  }
-  *num_indices = write_index;
-  return;
-}
-#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
 
 template<typename IsActiveOp>
 void gpu_parallel_active_index_array_impl(const uint num_states,
@@ -182,18 +166,11 @@ __device__
                                          num_simd_groups, \
                                          simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      if (ccl_gpu_global_size_x() == 1) \
-        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
-      else \
-        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
-#  else
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
-#  endif
+
+#  define gpu_parallel_active_index_array( \
+      blocksize, num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+
 #else
 
 #  define gpu_parallel_active_index_array( \
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 8ae40b0612e..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -55,18 +55,6 @@
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers)
 #define ccl_gpu_kernel_threads(block_num_threads)
 
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define KG_ND_ITEMS \
-    kg->nd_item_local_id_0 = item.get_local_id(0); \
-    kg->nd_item_local_range_0 = item.get_local_range(0); \
-    kg->nd_item_group_0 = item.get_group(0); \
-    kg->nd_item_group_range_0 = item.get_group_range(0); \
-    kg->nd_item_global_id_0 = item.get_global_id(0); \
-    kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-#  define KG_ND_ITEMS
-#endif
-
 #define ccl_gpu_kernel_signature(name, ...) \
   void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                             size_t kernel_global_size, \
@@ -76,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
     (kg); \
     cgh.parallel_for( \
         sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
-        [=](sycl::nd_item<1> item) { \
-          KG_ND_ITEMS
+        [=](sycl::nd_item<1> item) {
 
 #define ccl_gpu_kernel_postfix \
   }); \
@@ -95,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
   } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
 
 /* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
 
 /* GPU warp synchronization */
-
 #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
 #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
 
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
 #undef KERNEL_DATA_ARRAY
   IntegratorStateGPU *integrator_state;
   const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  size_t nd_item_local_id_0;
-  size_t nd_item_local_range_0;
-  size_t nd_item_group_0;
-  size_t nd_item_group_range_0;
-
-  size_t nd_item_global_id_0;
-  size_t nd_item_global_range_0;
-#endif
 } KernelGlobalsGPU;
 
 typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 1f32d3406ea..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -230,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
     /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
      * we extend work size to fit uniformity requirements. */
     global_size = groups_count * local_size;
-
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-    if (queue->get_device().is_host()) {
-      global_size = 1;
-      local_size = 1;
-    }
-#  endif
   }
 
   /* Let the compiler throw an error if there are any kernels missing in this implementation. */
-- 
cgit v1.2.3
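
Note (not part of the patch): the commit message points at the SYCL 2020 replacement for the removed host device, namely selecting a cpu device (or using plain C++ on the host). The snippet below is a minimal, illustrative sketch of that pattern, assuming a SYCL 2020 compiler such as oneAPI DPC++ and an available CPU backend; it is not taken from the Cycles sources.

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main()
    {
      /* SYCL 2020 removed the host device; for host-side debugging, select a
       * CPU device explicitly. This throws if no CPU backend is installed, in
       * which case sycl::default_selector_v can be used instead. */
      sycl::queue q{sycl::cpu_selector_v};
      std::cout << "Running on: "
                << q.get_device().get_info<sycl::info::device::name>() << "\n";

      /* Trivial USM workload, standing in for the debugging role the host
       * device used to serve. */
      int *data = sycl::malloc_shared<int>(16, q);
      q.parallel_for(sycl::range<1>(16), [=](sycl::id<1> i) { data[i] = int(i[0]); }).wait();
      sycl::free(data, q);
      return 0;
    }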