16 files changed, 109 insertions, 84 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 5516e97f34f..5296d819e42 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -187,18 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
   )
 endif()
 if (WITH_CYCLES_DEVICE_ONEAPI)
+  if(WITH_CYCLES_ONEAPI_BINARIES)
+    set(cycles_kernel_oneapi_lib_suffix "_aot")
+  else()
+    set(cycles_kernel_oneapi_lib_suffix "_jit")
+  endif()
   if(WIN32)
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib)
   else()
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so)
+  endif()
+  list(APPEND LIB ${cycles_kernel_oneapi_lib})
+  if(WIN32)
+    list(APPEND LIB debug ${SYCL_LIBRARY_DEBUG} optimized ${SYCL_LIBRARY})
+  else()
+    list(APPEND LIB ${SYCL_LIBRARY})
   endif()
-  list(APPEND LIB
-    ${cycles_kernel_oneapi_lib}
-    "$<$<CONFIG:Debug>:${SYCL_LIBRARY_DEBUG}>"
-    "$<$<CONFIG:Release>:${SYCL_LIBRARY}>"
-    "$<$<CONFIG:RelWithDebInfo>:${SYCL_LIBRARY}>"
-    "$<$<CONFIG:MinSizeRel>:${SYCL_LIBRARY}>"
-  )
   add_definitions(-DWITH_ONEAPI)
   list(APPEND SRC
     ${SRC_ONEAPI}
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                               cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
   ~CUDADeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = hip_device_->get_num_multiprocessors() *
                               hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
   ~HIPDeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 6a16d4bb3b4..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
   }
 
   source = global_defines + source;
+#  if 0
   metal_printf("================\n%s================\n\%s================\n",
                global_defines.c_str(),
                baked_constants.c_str());
+#  endif
 
   /* Generate an MD5 from the source and include any baked constants. This is used when caching
    * PSOs. */
@@ -339,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
 
   MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
 
+#  if defined(MAC_OS_VERSION_13_0)
+  if (@available(macos 13.0, *)) {
+    if (device_vendor == METAL_GPU_INTEL) {
+      [options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
+    }
+  }
+#  endif
+
   options.fastMathEnabled = YES;
   if (@available(macOS 12.0, *)) {
     options.languageVersion = MTLLanguageVersion2_4;
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 5e0cb6d18f4..55938d1a03a 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
     }
   }
 
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+      return false;
+    }
+  }
+
   if (pso_type != PSO_GENERIC) {
     /* Only specialize kernels where it can make an impact. */
     if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
@@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const
       }
     }
 
+    /* Workaround: Intel GPUs have issues using binary archives, so never use them there. */
+    MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
+    if (gpu_vendor == METAL_GPU_INTEL) {
+      return false;
+    }
+
     if (pso_type == PSO_GENERIC) {
       /* Archive the generic kernels. */
       return true;
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
   ~MetalDeviceQueue();
 
   virtual int num_concurrent_states(const size_t) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t) const override;
   virtual int num_sort_partition_elements() const override;
 
   virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,56 @@ MetalDeviceQueue::~MetalDeviceQueue()
   }
 }
 
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+/* Number of concurrent states to use for this queue.
+ *
+ * Starts from 1048576 states, multiplied by 4 on Apple GPUs and by 2 on AMD GPUs.
+ * On Apple M2 the count is doubled once more when enough GPU-available memory would remain.
+ *
+ * NOTE: the value is cached in a function-local static, so whatever the first call computes is
+ * returned by every later call, regardless of the `state_size` passed in. */
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  int result = 1048576;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
+  static int result = 0;
+  if (result) {
+    return result;
   }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+  result = 1048576;
+  if (metal_device_->device_vendor == METAL_GPU_APPLE) {
     result *= 4;
+
+    if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+      size_t system_ram = system_physical_ram();
+      size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+      size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+      /* Determine whether we can double the state count, and leave enough GPU-available memory
+       * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
+       * keep dispatch sizes high and minimize work submission overheads. */
+      size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+      size_t total_state_size = result * state_size;
+      /* Compare sums instead of subtracting: these are unsigned, so a difference would wrap
+       * around to a huge value and pass the check when memory is already over-committed. */
+      size_t required_working_set = allocated_so_far + total_state_size * 2 + min_headroom;
+      if (max_recommended_working_set >= required_working_set) {
+        result *= 2;
+        metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+      }
+    }
+  }
+  else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+    /* METAL_WIP */
+    /* TODO: compute automatically. */
+    /* TODO: must have at least num_threads_per_block. */
+    result *= 2;
   }
   return result;
 }
 
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  int result = 65536;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
-  }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
-    result *= 4;
-  }
-  return result;
+  /* Return a quarter of the total: a 1:4 busy:total ratio renders fastest at any state count. */
+  return num_concurrent_states(state_size) / 4;
 }
 
 int MetalDeviceQueue::num_sort_partition_elements() const
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index 65c67c400fe..f47638fac15 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
       usable |= (vendor == METAL_GPU_AMD);
     }
 
+#  if defined(MAC_OS_VERSION_13_0)
+    if (@available(macos 13.0, *)) {
+      usable |= (vendor == METAL_GPU_INTEL);
+    }
+#  endif
+
     if (usable) {
       metal_printf("- %s\n", device_name.c_str());
       [device retain];
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index f303ab41627..66d6f749e30 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -39,7 +39,7 @@ bool device_oneapi_init()
     _putenv_s("SYCL_CACHE_THRESHOLD", "0");
   }
   if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-    _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+    _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
   }
   if (getenv("SYCL_ENABLE_PCI") == nullptr) {
     _putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +50,7 @@ bool device_oneapi_init()
 #  elif __linux__
   setenv("SYCL_CACHE_PERSISTENT", "1", false);
   setenv("SYCL_CACHE_THRESHOLD", "0", false);
-  setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+  setenv("SYCL_DEVICE_FILTER", "level_zero", false);
   setenv("SYCL_ENABLE_PCI", "1", false);
   setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
 #  endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 91f53fd1eae..d0ddd69289c 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -430,9 +430,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
   sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
   (void)usm_type;
   assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::cpu || allow_host) &&
-          usm_type == sycl::usm::alloc::host));
+         ((device_type == sycl::info::device_type::cpu || allow_host) &&
+              usm_type == sycl::usm::alloc::host ||
+          usm_type == sycl::usm::alloc::unknown));
 #  else
   /* Silence warning about unused arguments. */
   (void)queue_;
@@ -668,16 +668,9 @@ int OneapiDevice::parse_driver_build_version(const sycl::device &device)
 std::vector<sycl::device> OneapiDevice::available_devices()
 {
   bool allow_all_devices = false;
-  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
+  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) {
     allow_all_devices = true;
-
-    /* Host device is useful only for debugging at the moment
-     * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
+  }
 
   const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
 
@@ -690,33 +683,28 @@ std::vector<sycl::device> OneapiDevice::available_devices()
     }
 
     const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
+        (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
+                              platform.get_devices(sycl::info::device_type::gpu);
 
     for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
-        bool filter_out = false;
-
+      bool filter_out = false;
+      if (!allow_all_devices) {
         /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
          * assuming they have either more than 96 Execution Units or not 7 threads per EU.
          * Official support can be broaden to older and smaller GPUs once ready. */
-        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
-          /* Filtered-out defaults in-case these values aren't available through too old L0
-           * runtime. */
+        if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
+          filter_out = true;
+        }
+        else {
+          /* Filtered-out defaults in case these values aren't available. */
           int number_of_eus = 96;
           int threads_per_eu = 7;
           if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+            number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
           }
           if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
             threads_per_eu =
-                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+                device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
           }
           /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
           if (number_of_eus <= 96 && threads_per_eu == 7) {
@@ -732,16 +720,9 @@ std::vector<sycl::device> OneapiDevice::available_devices()
             }
           }
         }
-        else if (!allow_host && device.is_host()) {
-          filter_out = true;
-        }
-        else if (!allow_all_devices) {
-          filter_out = true;
-        }
-
-        if (!filter_out) {
-          available_devices.push_back(device);
-        }
+      }
+      if (!filter_out) {
+        available_devices.push_back(device);
       }
     }
   }
@@ -797,9 +778,7 @@ char *OneapiDevice::device_capabilities()
     GET_NUM_ATTR(native_vector_width_double)
     GET_NUM_ATTR(native_vector_width_half)
 
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
     WRITE_ATTR("max_clock_frequency", max_clock_frequency)
 
     GET_NUM_ATTR(address_bits)
@@ -837,7 +816,7 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
     std::string name = device.get_info<sycl::info::device::name>();
     std::string id = "ONEAPI_" + platform_name + "_" + name;
     if (device.has(sycl::aspect::ext_intel_pci_address)) {
-      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+      id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
     }
     (cb)(id.c_str(), name.c_str(), num, user_ptr);
     num++;
@@ -855,7 +834,7 @@ int OneapiDevice::get_num_multiprocessors()
 {
   const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
   if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+    return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
   }
   else
     return 0;
@@ -866,8 +845,8 @@ int OneapiDevice::get_max_num_threads_per_multiprocessor()
   const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
   if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
       device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
-           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+    return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
+           device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
   }
   else
     return 0;
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 62034150eac..197cf03d60d 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -3,7 +3,7 @@
 
 #ifdef WITH_ONEAPI
 
-#  include <CL/sycl.hpp>
+#  include <sycl/sycl.hpp>
 
 #  include "device/device.h"
 #  include "device/oneapi/device.h"
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
                               oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t state_size) const override;
 
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
   /* Number of states which keeps the device occupied with work without losing performance.
    * The renderer will add more work (when available) when number of active paths falls below this
    * value. */
-  virtual int num_concurrent_busy_states() const = 0;
+  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
 
   /* Number of elements in a partition of sorted shaders, that improves memory locality of
    * integrator state fetch at the cost of decreased coherence for shader kernel execution. */