50 files changed, 508 insertions, 439 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index f619e6b104e..329aa3990f6 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -263,8 +263,7 @@ if(WITH_CYCLES_DEVICE_OPTIX)
       ${OPTIX_INCLUDE_DIR}
       )
   else()
-    message(STATUS "OptiX not found, disabling it from Cycles")
-    set(WITH_CYCLES_DEVICE_OPTIX OFF)
+    set_and_warn_library_found("OptiX" OPTIX_FOUND WITH_CYCLES_DEVICE_OPTIX)
   endif()
 endif()
 
@@ -387,8 +386,7 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang")
 endif()
 
 if(WITH_CYCLES_HYDRA_RENDER_DELEGATE AND (NOT WITH_USD))
-  message(STATUS "USD not found, disabling WITH_CYCLES_HYDRA_RENDER_DELEGATE")
-  set(WITH_CYCLES_HYDRA_RENDER_DELEGATE OFF)
+  set_and_warn_library_found("USD" WITH_USD WITH_CYCLES_HYDRA_RENDER_DELEGATE)
 endif()
 if(WITH_CYCLES_HYDRA_RENDER_DELEGATE AND (NOT WITH_BLENDER) AND (NOT WITH_CYCLES_STANDALONE))
   set(CYCLES_INSTALL_PATH ${CYCLES_INSTALL_PATH}/hdCycles/resources)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 0988b1c0ac4..1c7a861ea93 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -43,7 +43,10 @@ else()
 endif()
 
 if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
-  list(APPEND INC_SYS ${Epoxy_INCLUDE_DIRS} ${SDL2_INCLUDE_DIRS})
+  list(APPEND INC_SYS
+    ${Epoxy_INCLUDE_DIRS}
+    ${SDL2_INCLUDE_DIRS}
+  )
   list(APPEND LIB ${Epoxy_LIBRARIES} ${SDL2_LIBRARIES})
 endif()
 
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 794338fe78e..e33891fa7a2 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -209,22 +209,25 @@ def list_render_passes(scene, srl):
         yield ("Debug Sample Count", "X", 'VALUE')
 
     # Cryptomatte passes.
-    crypto_depth = (srl.pass_cryptomatte_depth + 1) // 2
+    # NOTE: Name channels are lowercase RGBA so that compression rules check in OpenEXR DWA code
+    # uses lossless compression. Reportedly this naming is the only one which works good from the
+    # interoperability point of view. Using XYZW naming is not portable.
+    crypto_depth = (min(16, srl.pass_cryptomatte_depth) + 1) // 2
     if srl.use_pass_cryptomatte_object:
         for i in range(0, crypto_depth):
-            yield ("CryptoObject" + '{:02d}'.format(i), "RGBA", 'COLOR')
+            yield ("CryptoObject" + '{:02d}'.format(i), "rgba", 'COLOR')
     if srl.use_pass_cryptomatte_material:
         for i in range(0, crypto_depth):
-            yield ("CryptoMaterial" + '{:02d}'.format(i), "RGBA", 'COLOR')
+            yield ("CryptoMaterial" + '{:02d}'.format(i), "rgba", 'COLOR')
     if srl.use_pass_cryptomatte_asset:
         for i in range(0, crypto_depth):
-            yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
+            yield ("CryptoAsset" + '{:02d}'.format(i), "rgba", 'COLOR')
 
     # Denoising passes.
     if scene.cycles.use_denoising and crl.use_denoising:
         yield ("Noisy Image", "RGBA", 'COLOR')
         if crl.use_pass_shadow_catcher:
-            yield ("Noisy Shadow Catcher", "RGBA", 'COLOR')
+            yield ("Noisy Shadow Catcher", "RGB", 'COLOR')
     if crl.denoising_store_passes:
         yield ("Denoising Normal", "XYZ", 'VECTOR')
         yield ("Denoising Albedo", "RGB", 'COLOR')
@@ -232,6 +235,8 @@ def list_render_passes(scene, srl):
 
     # Custom AOV passes.
     for aov in srl.aovs:
+        if not aov.is_valid:
+            continue
         if aov.type == 'VALUE':
             yield (aov.name, "X", 'VALUE')
         else:
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index b7ce76d8f44..425d123e9e6 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1651,6 +1651,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
             box.prop(
                 device, "use", text=device.name
                 .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
+                .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN'))
                 .replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
                 .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
             )
diff --git a/intern/cycles/blender/pointcloud.cpp b/intern/cycles/blender/pointcloud.cpp
index 35be2916e43..a679a92d997 100644
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@@ -194,7 +194,7 @@ static void export_pointcloud(Scene *scene,
   /* Export points. */
   for (int i = 0; i < num_points; i++) {
     const float3 co = get_float3(b_attr_position.data[i].vector());
-    const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f;
+    const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f;
     pointcloud->add_point(co, radius);
 
     /* Random number per point. */
@@ -232,7 +232,7 @@ static void export_pointcloud_motion(PointCloud *pointcloud,
 
   for (int i = 0; i < std::min(num_points, b_points_num); i++) {
     const float3 co = get_float3(b_attr_position.data[i].vector());
-    const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f;
+    const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.01f;
     float3 P = co;
     P.w = radius;
     mP[i] = P;
diff --git a/intern/cycles/blender/shader.cpp b/intern/cycles/blender/shader.cpp
index fd32e7ca1d7..dbc49df7f22 100644
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -215,7 +215,9 @@ static void set_default_value(ShaderInput *input,
     }
     case SocketType::INT: {
       if (b_sock.type() == BL::NodeSocket::type_BOOLEAN) {
-        node->set(socket, get_boolean(b_sock.ptr, "default_value"));
+        /* Make sure to call the int overload of set() since this is an integer socket as far as
+         * Cycles is concerned. */
+        node->set(socket, get_boolean(b_sock.ptr, "default_value") ? 1 : 0);
       }
       else {
         node->set(socket, get_int(b_sock.ptr, "default_value"));
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index a69a94614d3..5251f0fee9c 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -575,68 +575,72 @@ void BlenderSync::sync_images()
 
 /* Passes */
 
-static PassType get_blender_pass_type(BL::RenderPass &b_pass)
+static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode &mode)
 {
   string name = b_pass.name();
-#define MAP_PASS(passname, passtype) \
+#define MAP_PASS(passname, passtype, noisy) \
   if (name == passname) { \
-    return passtype; \
+    type = passtype; \
+    mode = (noisy) ? PassMode::NOISY : PassMode::DENOISED; \
+    return true; \
   } \
   ((void)0)
 
-  /* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+  /* NOTE: Keep in sync with defined names from engine.py */
 
-  MAP_PASS("Combined", PASS_COMBINED);
-  MAP_PASS("Noisy Image", PASS_COMBINED);
+  MAP_PASS("Combined", PASS_COMBINED, false);
+  MAP_PASS("Noisy Image", PASS_COMBINED, true);
 
-  MAP_PASS("Depth", PASS_DEPTH);
-  MAP_PASS("Mist", PASS_MIST);
-  MAP_PASS("Position", PASS_POSITION);
-  MAP_PASS("Normal", PASS_NORMAL);
-  MAP_PASS("IndexOB", PASS_OBJECT_ID);
-  MAP_PASS("UV", PASS_UV);
-  MAP_PASS("Vector", PASS_MOTION);
-  MAP_PASS("IndexMA", PASS_MATERIAL_ID);
+  MAP_PASS("Depth", PASS_DEPTH, false);
+  MAP_PASS("Mist", PASS_MIST, false);
+  MAP_PASS("Position", PASS_POSITION, false);
+  MAP_PASS("Normal", PASS_NORMAL, false);
+  MAP_PASS("IndexOB", PASS_OBJECT_ID, false);
+  MAP_PASS("UV", PASS_UV, false);
+  MAP_PASS("Vector", PASS_MOTION, false);
+  MAP_PASS("IndexMA", PASS_MATERIAL_ID, false);
 
-  MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT);
-  MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT);
-  MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT);
-  MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT);
+  MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT, false);
+  MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT, false);
+  MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT, false);
+  MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT, false);
 
-  MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT);
-  MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT);
-  MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT);
-  MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT);
+  MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT, false);
+  MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT, false);
+  MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT, false);
+  MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT, false);
 
-  MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR);
-  MAP_PASS("GlossCol", PASS_GLOSSY_COLOR);
-  MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR);
+  MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR, false);
+  MAP_PASS("GlossCol", PASS_GLOSSY_COLOR, false);
+  MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR, false);
 
-  MAP_PASS("Emit", PASS_EMISSION);
-  MAP_PASS("Env", PASS_BACKGROUND);
-  MAP_PASS("AO", PASS_AO);
-  MAP_PASS("Shadow", PASS_SHADOW);
+  MAP_PASS("Emit", PASS_EMISSION, false);
+  MAP_PASS("Env", PASS_BACKGROUND, false);
+  MAP_PASS("AO", PASS_AO, false);
+  MAP_PASS("Shadow", PASS_SHADOW, false);
 
-  MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE);
-  MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL);
+  MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE, false);
+  MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL, false);
 
-  MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL);
-  MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO);
-  MAP_PASS("Denoising Depth", PASS_DENOISING_DEPTH);
+  MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL, true);
+  MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO, true);
+  MAP_PASS("Denoising Depth", PASS_DENOISING_DEPTH, true);
 
-  MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER);
-  MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER);
+  MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER, false);
+  MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER, true);
 
-  MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
-  MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
+  MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false);
+  MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false);
 
   if (string_startswith(name, cryptomatte_prefix)) {
-    return PASS_CRYPTOMATTE;
+    type = PASS_CRYPTOMATTE;
+    mode = PassMode::DENOISED;
+    return true;
   }
 
 #undef MAP_PASS
 
-  return PASS_NONE;
+  return false;
 }
 
 static Pass *pass_add(Scene *scene,
@@ -655,8 +659,6 @@ static Pass *pass_add(Scene *scene,
 
 void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
 {
-  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-
   /* Delete all existing passes. */
   set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end());
   scene->delete_nodes(clear_passes);
@@ -664,103 +666,23 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
   /* Always add combined pass. */
   pass_add(scene, PASS_COMBINED, "Combined");
 
-  /* Blender built-in data and light passes. */
-  for (BL::RenderPass &b_pass : b_rlay.passes) {
-    const PassType pass_type = get_blender_pass_type(b_pass);
-
-    if (pass_type == PASS_NONE) {
-      LOG(ERROR) << "Unknown pass " << b_pass.name();
-      continue;
-    }
-
-    if (pass_type == PASS_MOTION &&
-        (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
-      continue;
-    }
-
-    pass_add(scene, pass_type, b_pass.name().c_str());
-  }
-
-  PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
-
-  /* Debug passes. */
-  if (get_boolean(crl, "pass_debug_sample_count")) {
-    b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
-    pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count");
-  }
-
-  /* Cycles specific passes. */
-  if (get_boolean(crl, "use_pass_volume_direct")) {
-    b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
-    pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir");
-  }
-  if (get_boolean(crl, "use_pass_volume_indirect")) {
-    b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
-    pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd");
-  }
-  if (get_boolean(crl, "use_pass_shadow_catcher")) {
-    b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
-    pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher");
-  }
-
   /* Cryptomatte stores two ID/weight pairs per RGBA layer.
-   * User facing parameter is the number of pairs.
-   *
-   * NOTE: Name channels lowercase RGBA so that compression rules check in OpenEXR DWA code uses
-   * lossless compression. Reportedly this naming is the only one which works good from the
-   * interoperability point of view. Using XYZW naming is not portable. */
+   * User facing parameter is the number of pairs. */
   int crypto_depth = divide_up(min(16, b_view_layer.pass_cryptomatte_depth()), 2);
   scene->film->set_cryptomatte_depth(crypto_depth);
   CryptomatteType cryptomatte_passes = CRYPT_NONE;
   if (b_view_layer.use_pass_cryptomatte_object()) {
-    for (int i = 0; i < crypto_depth; i++) {
-      string passname = cryptomatte_prefix + string_printf("Object%02d", i);
-      b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str());
-      pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
-    }
     cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT);
   }
   if (b_view_layer.use_pass_cryptomatte_material()) {
-    for (int i = 0; i < crypto_depth; i++) {
-      string passname = cryptomatte_prefix + string_printf("Material%02d", i);
-      b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str());
-      pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
-    }
     cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL);
   }
   if (b_view_layer.use_pass_cryptomatte_asset()) {
-    for (int i = 0; i < crypto_depth; i++) {
-      string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
-      b_engine.add_pass(passname.c_str(), 4, "rgba", b_view_layer.name().c_str());
-      pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
-    }
     cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET);
   }
   scene->film->set_cryptomatte_passes(cryptomatte_passes);
 
-  /* Denoising passes. */
-  const bool use_denoising = get_boolean(cscene, "use_denoising") &&
-                             get_boolean(crl, "use_denoising");
-  const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes");
-  if (use_denoising) {
-    b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
-    pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY);
-    if (get_boolean(crl, "use_pass_shadow_catcher")) {
-      b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
-      pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY);
-    }
-  }
-  if (store_denoising_passes) {
-    b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
-    pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY);
-
-    b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
-    pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY);
-
-    b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
-    pass_add(scene, PASS_DENOISING_DEPTH, "Denoising Depth", PassMode::NOISY);
-  }
-
+  /* Path guiding debug passes. */
 #ifdef WITH_CYCLES_DEBUG
   b_engine.add_pass("Guiding Color", 3, "RGB", b_view_layer.name().c_str());
   pass_add(scene, PASS_GUIDING_COLOR, "Guiding Color", PassMode::NOISY);
@@ -772,6 +694,8 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
   pass_add(scene, PASS_GUIDING_AVG_ROUGHNESS, "Guiding Average Roughness", PassMode::NOISY);
 #endif
 
+  unordered_set<string> expected_passes;
+
   /* Custom AOV passes. */
   BL::ViewLayer::aovs_iterator b_aov_iter;
   for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) {
@@ -781,16 +705,10 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
     }
 
     string name = b_aov.name();
-    bool is_color = b_aov.type() == BL::AOV::type_COLOR;
+    PassType type = (b_aov.type() == BL::AOV::type_COLOR) ? PASS_AOV_COLOR : PASS_AOV_VALUE;
 
-    if (is_color) {
-      b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str());
-      pass_add(scene, PASS_AOV_COLOR, name.c_str());
-    }
-    else {
-      b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str());
-      pass_add(scene, PASS_AOV_VALUE, name.c_str());
-    }
+    pass_add(scene, type, name.c_str());
+    expected_passes.insert(name);
   }
 
   /* Light Group passes. */
@@ -802,9 +720,29 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
 
     string name = string_printf("Combined_%s", b_lightgroup.name().c_str());
 
-    b_engine.add_pass(name.c_str(), 3, "RGB", b_view_layer.name().c_str());
     Pass *pass = pass_add(scene, PASS_COMBINED, name.c_str(), PassMode::NOISY);
     pass->set_lightgroup(ustring(b_lightgroup.name()));
+    expected_passes.insert(name);
+  }
+
+  /* Sync the passes that were defined in engine.py. */
+  for (BL::RenderPass &b_pass : b_rlay.passes) {
+    PassType pass_type = PASS_NONE;
+    PassMode pass_mode = PassMode::DENOISED;
+
+    if (!get_known_pass_type(b_pass, pass_type, pass_mode)) {
+      if (!expected_passes.count(b_pass.name())) {
+        LOG(ERROR) << "Unknown pass " << b_pass.name();
+      }
+      continue;
+    }
+
+    if (pass_type == PASS_MOTION &&
+        (b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
+      continue;
+    }
+
+    pass_add(scene, pass_type, b_pass.name().c_str(), pass_mode);
   }
 
   scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold());
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 9524cda54f5..44542a08156 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -289,8 +289,7 @@ if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_PATH_GUIDING)
     endif()
     get_target_property(OPENPGL_INCLUDE_DIR openpgl::openpgl INTERFACE_INCLUDE_DIRECTORIES)
   else()
-    set(WITH_CYCLES_PATH_GUIDING OFF)
-    message(STATUS "OpenPGL not found, disabling WITH_CYCLES_PATH_GUIDING")
+    set_and_warn_library_found("OpenPGL" openpgl_FOUND WITH_CYCLES_PATH_GUIDING)
   endif()
 endif()
 
@@ -588,16 +587,14 @@ if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI)
   # We can't use the version from the Blender precompiled libraries because
   # it does not include the video subsystem.
   find_package(SDL2 REQUIRED)
+  set_and_warn_library_found("SDL" SDL2_FOUND WITH_CYCLES_STANDALONE_GUI)
 
-  if(NOT SDL2_FOUND)
-    set(WITH_CYCLES_STANDALONE_GUI OFF)
-    message(STATUS "SDL not found, disabling Cycles standalone GUI")
+  if(SDL2_FOUND)
+    include_directories(
+      SYSTEM
+      ${SDL2_INCLUDE_DIRS}
+    )
   endif()
-
-  include_directories(
-    SYSTEM
-    ${SDL2_INCLUDE_DIRS}
-  )
 endif()
 
 ###########################################################################
@@ -606,11 +603,11 @@ endif()
 
 if(WITH_CYCLES_DEVICE_CUDA AND (WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD))
   find_package(CUDA) # Try to auto locate CUDA toolkit
+  set_and_warn_library_found("CUDA compiler" CUDA_FOUND WITH_CYCLES_CUDA_BINARIES)
+
   if(CUDA_FOUND)
     message(STATUS "Found CUDA ${CUDA_NVCC_EXECUTABLE} (${CUDA_VERSION})")
   else()
-    message(STATUS "CUDA compiler not found, disabling WITH_CYCLES_CUDA_BINARIES")
-    set(WITH_CYCLES_CUDA_BINARIES OFF)
     if(NOT WITH_CUDA_DYNLOAD)
       message(STATUS "Additionally falling back to dynamic CUDA load")
       set(WITH_CUDA_DYNLOAD ON)
@@ -624,11 +621,10 @@ endif()
 
 if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
   find_package(HIP)
+  set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
+
   if(HIP_FOUND)
     message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
-  else()
-    message(STATUS "HIP compiler not found, disabling WITH_CYCLES_HIP_BINARIES")
-    set(WITH_CYCLES_HIP_BINARIES OFF)
   endif()
 endif()
 
@@ -644,13 +640,17 @@ if(WITH_CYCLES_DEVICE_METAL)
   find_library(METAL_LIBRARY Metal)
 
   # This file was added in the 12.0 SDK, use it as a way to detect the version.
-  if(METAL_LIBRARY AND NOT EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
-    message(STATUS "Metal version too old, must be SDK 12.0 or newer, disabling WITH_CYCLES_DEVICE_METAL")
-    set(WITH_CYCLES_DEVICE_METAL OFF)
-  elseif(NOT METAL_LIBRARY)
-    message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
-    set(WITH_CYCLES_DEVICE_METAL OFF)
-  else()
+  if(METAL_LIBRARY)
+    if(EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
+      set(METAL_FOUND ON)
+    else()
+      message(STATUS "Metal version too old, must be SDK 12.0 or newer")
+      set(METAL_FOUND OFF)
+    endif()
+  endif()
+
+  set_and_warn_library_found("Metal" METAL_FOUND WITH_CYCLES_DEVICE_METAL)
+  if(METAL_FOUND)
     message(STATUS "Found Metal: ${METAL_LIBRARY}")
   endif()
 endif()
@@ -662,9 +662,10 @@ endif()
 if(WITH_CYCLES_DEVICE_ONEAPI)
   find_package(SYCL)
   find_package(LevelZero)
+  set_and_warn_library_found("oneAPI" SYCL_FOUND WITH_CYCLES_DEVICE_ONEAPI)
+  set_and_warn_library_found("Level Zero" LEVEL_ZERO_FOUND WITH_CYCLES_DEVICE_ONEAPI)
 
-  if(SYCL_FOUND AND LEVEL_ZERO_FOUND)
-    message(STATUS "Found oneAPI: ${SYCL_LIBRARY}")
+  if(SYCL_FOUND AND SYCL_VERSION VERSION_GREATER_EQUAL 6.0 AND LEVEL_ZERO_FOUND)
     message(STATUS "Found Level Zero: ${LEVEL_ZERO_LIBRARY}")
 
     if(WITH_CYCLES_ONEAPI_BINARIES)
@@ -675,13 +676,14 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
       endif()
 
       if(NOT EXISTS ${OCLOC_INSTALL_DIR})
-        message(STATUS "oneAPI ocloc not found in ${OCLOC_INSTALL_DIR}, disabling WITH_CYCLES_ONEAPI_BINARIES."
+        set(OCLOC_FOUND OFF)
+        message(STATUS "oneAPI ocloc not found in ${OCLOC_INSTALL_DIR}."
                        " A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.")
-        set(WITH_CYCLES_ONEAPI_BINARIES OFF)
+        set_and_warn_library_found("ocloc" OCLOC_FOUND WITH_CYCLES_ONEAPI_BINARIES)
       endif()
     endif()
   else()
-    message(STATUS "oneAPI or Level Zero not found, disabling WITH_CYCLES_DEVICE_ONEAPI")
+    message(STATUS "SYCL 6.0+ or Level Zero not found, disabling WITH_CYCLES_DEVICE_ONEAPI")
     set(WITH_CYCLES_DEVICE_ONEAPI OFF)
   endif()
 endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 5516e97f34f..5296d819e42 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -187,18 +187,22 @@ if(WITH_CYCLES_DEVICE_METAL)
   )
 endif()
 if (WITH_CYCLES_DEVICE_ONEAPI)
+  if(WITH_CYCLES_ONEAPI_BINARIES)
+    set(cycles_kernel_oneapi_lib_suffix "_aot")
+  else()
+    set(cycles_kernel_oneapi_lib_suffix "_jit")
+  endif()
   if(WIN32)
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi.lib)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib)
   else()
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi.so)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/../kernel/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so)
+  endif()
+  list(APPEND LIB ${cycles_kernel_oneapi_lib})
+  if(WIN32)
+    list(APPEND LIB debug ${SYCL_LIBRARY_DEBUG} optimized ${SYCL_LIBRARY})
+  else()
+    list(APPEND LIB ${SYCL_LIBRARY})
   endif()
-  list(APPEND LIB
-    ${cycles_kernel_oneapi_lib}
-    "$<$<CONFIG:Debug>:${SYCL_LIBRARY_DEBUG}>"
-    "$<$<CONFIG:Release>:${SYCL_LIBRARY}>"
-    "$<$<CONFIG:RelWithDebInfo>:${SYCL_LIBRARY}>"
-    "$<$<CONFIG:MinSizeRel>:${SYCL_LIBRARY}>"
-  )
   add_definitions(-DWITH_ONEAPI)
   list(APPEND SRC
     ${SRC_ONEAPI}
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                               cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
   ~CUDADeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = hip_device_->get_num_multiprocessors() *
                               hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
   ~HIPDeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index d1250b83d22..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -254,6 +254,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
       break;
   }
 
+  NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+  NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
+  global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";
+
   string &source = this->source[pso_type];
   source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
   source = path_source_replace_includes(source, path_get("source"));
@@ -292,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
   }
 
   source = global_defines + source;
+#  if 0
   metal_printf("================\n%s================\n\%s================\n",
                global_defines.c_str(),
                baked_constants.c_str());
+#  endif
 
   /* Generate an MD5 from the source and include any baked constants. This is used when caching
    * PSOs. */
@@ -335,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
 
   MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
 
+#  if defined(MAC_OS_VERSION_13_0)
+  if (@available(macos 13.0, *)) {
+    if (device_vendor == METAL_GPU_INTEL) {
+      [options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
+    }
+  }
+#  endif
+
   options.fastMathEnabled = YES;
   if (@available(macOS 12.0, *)) {
     options.languageVersion = MTLLanguageVersion2_4;
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 5e0cb6d18f4..55938d1a03a 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
     }
   }
 
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+      return false;
+    }
+  }
+
   if (pso_type != PSO_GENERIC) {
     /* Only specialize kernels where it can make an impact. */
     if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
@@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const
       }
     }
 
+    /* Workaround for Intel GPU having issue using Binary Archives */
+    MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
+    if (gpu_vendor == METAL_GPU_INTEL) {
+      return false;
+    }
+
     if (pso_type == PSO_GENERIC) {
       /* Archive the generic kernels. */
       return true;
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
   ~MetalDeviceQueue();
 
   virtual int num_concurrent_states(const size_t) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t) const override;
   virtual int num_sort_partition_elements() const override;
 
   virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
   }
 }
 
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  int result = 1048576;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
+  static int result = 0;
+  if (result) {
+    return result;
   }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+  result = 1048576;
+  if (metal_device_->device_vendor == METAL_GPU_APPLE) {
     result *= 4;
+
+    if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+      size_t system_ram = system_physical_ram();
+      size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+      size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+      /* Determine whether we can double the state count, and leave enough GPU-available memory
+       * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
+       * keep dispatch sizes high and minimize work submission overheads. */
+      size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+      size_t total_state_size = result * state_size;
+      if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
+        result *= 2;
+        metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+      }
+    }
+  }
+  else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+    /* METAL_WIP */
+    /* TODO: compute automatically. */
+    /* TODO: must have at least num_threads_per_block. */
+    result *= 2;
   }
   return result;
 }
 
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  int result = 65536;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
-  }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
-    result *= 4;
-  }
-  return result;
+  /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
+  return num_concurrent_states(state_size) / 4;
 }
 
 int MetalDeviceQueue::num_sort_partition_elements() const
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index 65c67c400fe..f47638fac15 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
       usable |= (vendor == METAL_GPU_AMD);
     }
 
+#  if defined(MAC_OS_VERSION_13_0)
+    if (@available(macos 13.0, *)) {
+      usable |= (vendor == METAL_GPU_INTEL);
+    }
+#  endif
+
     if (usable) {
       metal_printf("- %s\n", device_name.c_str());
       [device retain];
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index f303ab41627..66d6f749e30 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -39,7 +39,7 @@ bool device_oneapi_init()
     _putenv_s("SYCL_CACHE_THRESHOLD", "0");
   }
   if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-    _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
+    _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
   }
   if (getenv("SYCL_ENABLE_PCI") == nullptr) {
     _putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +50,7 @@ bool device_oneapi_init()
 #  elif __linux__
   setenv("SYCL_CACHE_PERSISTENT", "1", false);
   setenv("SYCL_CACHE_THRESHOLD", "0", false);
-  setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+  setenv("SYCL_DEVICE_FILTER", "level_zero", false);
   setenv("SYCL_ENABLE_PCI", "1", false);
   setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
 #  endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 2df605fa047..3588b75713b 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -43,7 +43,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
   }
 
   size_t globals_segment_size;
-  is_finished_ok = kernel_globals_size(device_queue_, globals_segment_size);
+  is_finished_ok = kernel_globals_size(globals_segment_size);
   if (is_finished_ok == false) {
     set_error("oneAPI constant memory initialization got runtime exception \"" +
               oneapi_error_string_ + "\"");
@@ -88,18 +88,26 @@ BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
 bool OneapiDevice::load_kernels(const uint requested_features)
 {
   assert(device_queue_);
-  /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with certain feature set
-   * with specialization constants, but it hasn't been implemented yet. */
-  (void)requested_features;
 
   bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
   if (is_finished_ok == false) {
-    set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\"");
+    set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
+              "\"");
+    return false;
   }
   else {
-    VLOG_INFO << "Runtime compilation done for \"" << info.description << "\"";
+    VLOG_INFO << "Test kernel has been executed successfully for \"" << info.description << "\"";
     assert(device_queue_);
   }
+
+  is_finished_ok = oneapi_load_kernels(device_queue_, (const unsigned int)requested_features);
+  if (is_finished_ok == false) {
+    set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
+  }
+  else {
+    VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
+  }
+
   return is_finished_ok;
 }
 
@@ -422,9 +430,14 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
   sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
   (void)usm_type;
   assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::cpu || allow_host) &&
-          usm_type == sycl::usm::alloc::host));
+         ((device_type == sycl::info::device_type::cpu || allow_host) &&
+              usm_type == sycl::usm::alloc::host ||
+          usm_type == sycl::usm::alloc::unknown));
+#  else
+  /* Silence warning about unused arguments. */
+  (void)queue_;
+  (void)usm_ptr;
+  (void)allow_host;
 #  endif
 }
 
@@ -552,7 +565,7 @@ bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
   }
 }
 
-bool OneapiDevice::kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
+bool OneapiDevice::kernel_globals_size(size_t &kernel_global_size)
 {
   kernel_global_size = sizeof(KernelGlobalsGPU);
 
@@ -658,14 +671,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
   if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
     allow_all_devices = true;
 
-    /* Host device is useful only for debugging at the moment
-     * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
-
   const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
 
   std::vector<sycl::device> available_devices;
@@ -677,17 +682,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
     }
 
     const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
+        (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
+                              platform.get_devices(sycl::info::device_type::gpu);
 
     for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
+      if (!allow_all_devices) {
         bool filter_out = false;
 
         /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
@@ -699,11 +698,11 @@ std::vector<sycl::device> OneapiDevice::available_devices()
           int number_of_eus = 96;
           int threads_per_eu = 7;
           if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+            number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
           }
           if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
             threads_per_eu =
-                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+                device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
           }
           /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
           if (number_of_eus <= 96 && threads_per_eu == 7) {
@@ -719,9 +718,6 @@ std::vector<sycl::device> OneapiDevice::available_devices()
             }
           }
         }
-        else if (!allow_host && device.is_host()) {
-          filter_out = true;
-        }
         else if (!allow_all_devices) {
           filter_out = true;
         }
@@ -784,9 +780,7 @@ char *OneapiDevice::device_capabilities()
     GET_NUM_ATTR(native_vector_width_double)
     GET_NUM_ATTR(native_vector_width_half)
 
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
     WRITE_ATTR("max_clock_frequency", max_clock_frequency)
 
     GET_NUM_ATTR(address_bits)
@@ -824,7 +818,7 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
     std::string name = device.get_info<sycl::info::device::name>();
     std::string id = "ONEAPI_" + platform_name + "_" + name;
     if (device.has(sycl::aspect::ext_intel_pci_address)) {
-      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+      id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
     }
     (cb)(id.c_str(), name.c_str(), num, user_ptr);
     num++;
@@ -842,7 +836,7 @@ int OneapiDevice::get_num_multiprocessors()
 {
   const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
   if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+    return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
   }
   else
     return 0;
@@ -853,8 +847,8 @@ int OneapiDevice::get_max_num_threads_per_multiprocessor()
   const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
   if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
       device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
-           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+    return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
+           device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
   }
   else
     return 0;
diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h
index 3589e881a6e..197cf03d60d 100644
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -3,7 +3,7 @@
 
 #ifdef WITH_ONEAPI
 
-#  include <CL/sycl.hpp>
+#  include <sycl/sycl.hpp>
 
 #  include "device/device.h"
 #  include "device/oneapi/device.h"
@@ -104,7 +104,7 @@ class OneapiDevice : public Device {
   int get_num_multiprocessors();
   int get_max_num_threads_per_multiprocessor();
   bool queue_synchronize(SyclQueue *queue);
-  bool kernel_globals_size(SyclQueue *queue, size_t &kernel_global_size);
+  bool kernel_globals_size(size_t &kernel_global_size);
   void set_global_memory(SyclQueue *queue,
                          void *kernel_globals,
                          const char *memory_name,
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
                               oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t state_size) const override;
 
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
   /* Number of states which keeps the device occupied with work without losing performance.
    * The renderer will add more work (when available) when number of active paths falls below this
    * value. */
-  virtual int num_concurrent_busy_states() const = 0;
+  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
 
   /* Number of elements in a partition of sorted shaders, that improves memory locality of
    * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 6b033cfd051..8e8fbd86be0 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -43,8 +43,11 @@ PathTrace::PathTrace(Device *device,
   /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
    * as possible. */
   device_->foreach_device([&](Device *path_trace_device) {
-    path_trace_works_.emplace_back(PathTraceWork::create(
-        path_trace_device, film, device_scene, &render_cancel_.is_requested));
+    unique_ptr<PathTraceWork> work = PathTraceWork::create(
+        path_trace_device, film, device_scene, &render_cancel_.is_requested);
+    if (work) {
+      path_trace_works_.emplace_back(std::move(work));
+    }
   });
 
   work_balance_infos_.resize(path_trace_works_.size());
@@ -1293,6 +1296,7 @@ void PathTrace::set_guiding_params(const GuidingParams &guiding_params, const bo
 #  if OPENPGL_VERSION_MINOR >= 4
       field_args.deterministic = guiding_params.deterministic;
 #  endif
+      reinterpret_cast<PGLKDTreeArguments *>(field_args.spatialSturctureArguments)->maxDepth = 16;
       openpgl::cpp::Device *guiding_device = static_cast<openpgl::cpp::Device *>(
           device_->get_guiding_device());
       if (guiding_device) {
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
index bb5c6e1a61a..a5f98b5475a 100644
--- a/intern/cycles/integrator/path_trace_work.cpp
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -23,6 +23,10 @@ unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
   if (device->info.type == DEVICE_CPU) {
     return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
   }
+  if (device->info.type == DEVICE_DUMMY) {
+    /* Dummy devices can't perform any work. */
+    return nullptr;
+  }
 
   return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
 }
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index d5ac830db58..188ec28cf65 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -285,7 +285,7 @@ void PathTraceWorkCPU::cryptomatte_postproces()
 }
 
 #ifdef WITH_PATH_GUIDING
-/* Note: It seems that this is called before every rendering iteration/progression and not once per
+/* NOTE: It seems that this is called before every rendering iteration/progression and not once per
  * rendering. May be we find a way to call it only once per rendering. */
 void PathTraceWorkCPU::guiding_init_kernel_globals(void *guiding_field,
                                                    void *sample_data_storage,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index ee250a6916b..48f6cf3c903 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -18,13 +18,15 @@
 
 CCL_NAMESPACE_BEGIN
 
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(const uint kernel_features)
 {
   size_t state_size = 0;
 
 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
 #define KERNEL_STRUCT_END(name) \
   break; \
   }
@@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
       display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
-      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
-      min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
+      max_num_paths_(0),
+      min_num_active_main_paths_(0),
       max_active_main_path_index_(0)
 {
   memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
-
-  /* Limit number of active paths to the half of the overall state. This is due to the logic in the
-   * path compaction which relies on the fact that regeneration does not happen sooner than half of
-   * the states are available again. */
-  min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
 }
 
 void PathTraceWorkGPU::alloc_integrator_soa()
@@ -103,6 +100,20 @@ void PathTraceWorkGPU::alloc_integrator_soa()
   integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
                                                 requested_volume_stack_size);
 
+  /* Deterine the number of path states. Deferring this for as long as possible allows the backend
+   * to make better decisions about memory availability. */
+  if (max_num_paths_ == 0) {
+    size_t single_state_size = estimate_single_state_size(kernel_features);
+
+    max_num_paths_ = queue_->num_concurrent_states(single_state_size);
+    min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
+
+    /* Limit number of active paths to the half of the overall state. This is due to the logic in
+     * the path compaction which relies on the fact that regeneration does not happen sooner than
+     * half of the states are available again. */
+    min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
+  }
+
   /* Allocate a device only memory buffer before for each struct member, and then
    * write the pointers into a struct that resides in constant memory.
    *
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
index 5f1c6c92b9d..0fe170b2791 100644
--- a/intern/cycles/integrator/work_balancer.cpp
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -17,6 +17,9 @@ void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
     work_balance_infos[0].weight = 1.0;
     return;
   }
+  else if (num_infos == 0) {
+    return;
+  }
 
   /* There is no statistics available, so start with an equal distribution. */
   const double weight = 1.0 / num_infos;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 8f50c7586b8..81c5f593974 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -713,10 +713,17 @@ endif()
 # oneAPI module
 
 if(WITH_CYCLES_DEVICE_ONEAPI)
+  if(WITH_CYCLES_ONEAPI_BINARIES)
+    set(cycles_kernel_oneapi_lib_suffix "_aot")
+  else()
+    set(cycles_kernel_oneapi_lib_suffix "_jit")
+  endif()
+
   if(WIN32)
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.dll)
+    set(cycles_kernel_oneapi_linker_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.lib)
   else()
-    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi.so)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/libcycles_kernel_oneapi${cycles_kernel_oneapi_lib_suffix}.so)
   endif()
 
   set(cycles_oneapi_kernel_sources
@@ -727,16 +734,9 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
     ${SRC_UTIL_HEADERS}
   )
 
-  set (ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS 1)
+  set (SYCL_OFFLINE_COMPILER_PARALLEL_JOBS 1 CACHE STRING "Number of parallel compiler instances to use for device binaries compilation (expect ~8GB peak memory usage per instance).")
   if (WITH_CYCLES_ONEAPI_BINARIES)
-    cmake_host_system_information(RESULT AVAILABLE_MEMORY_AMOUNT QUERY AVAILABLE_PHYSICAL_MEMORY)
-    # Conservative value of peak consumption here, just to be fully sure that other backend compilers will have enough memory as well
-    set(ONEAPI_GPU_COMPILER_MEMORY_AT_PEAK_MB 8150)
-    math(EXPR ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS "${AVAILABLE_MEMORY_AMOUNT} / ${ONEAPI_GPU_COMPILER_MEMORY_AT_PEAK_MB}")
-    if (ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS LESS 1)
-      set(ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS 1)
-    endif()
-    message(STATUS "${ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS} instance(s) of oneAPI offline compiler will be used.")
+    message(STATUS "${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS} instance(s) of oneAPI offline compiler will be used.")
   endif()
   # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options
   set(sycl_compiler_flags
@@ -747,7 +747,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
       -mllvm -inlinedefault-threshold=250
       -mllvm -inlinehint-threshold=350
       -fsycl-device-code-split=per_kernel
-      -fsycl-max-parallel-link-jobs=${ONEAPI_OFFLINE_COMPILER_PARALLEL_JOBS}
+      -fsycl-max-parallel-link-jobs=${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS}
       -shared
       -DWITH_ONEAPI
       -ffast-math
@@ -758,10 +758,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
       ${SYCL_CPP_FLAGS}
       )
 
-  if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
-    list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
-  endif()
-
   # Set defaults for spir64 and spir64_gen options
   if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
     set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
@@ -774,6 +770,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
   string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
 
   if (WITH_CYCLES_ONEAPI_BINARIES)
+    # AoT binaries aren't currently reused when calling sycl::build.
+    list (APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
     # Iterate over all targest and their options
     list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string)
     list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
@@ -826,12 +824,17 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
     -DONEAPI_EXPORT)
 
     string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR})
-    if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows
+    # Version Folder between Redist and Tools can mismatch sometimes
+    if(NOT EXISTS ${MSVC_TOOLS_DIR})
+      get_filename_component(cmake_ar_dir ${CMAKE_AR} DIRECTORY)
+      get_filename_component(MSVC_TOOLS_DIR "${cmake_ar_dir}/../../../" ABSOLUTE)
+    endif()
+    if(CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION)
+      set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION})
+    else() # case for Ninja on Windows
       get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY)
       string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir})
       get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE)
-    else()
-      set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION})
     endif()
     list(APPEND sycl_compiler_flags
                 -L "${MSVC_TOOLS_DIR}/lib/x64"
@@ -843,15 +846,13 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
     set(sycl_compiler_flags_RelWithDebInfo ${sycl_compiler_flags})
     set(sycl_compiler_flags_MinSizeRel ${sycl_compiler_flags})
     list(APPEND sycl_compiler_flags_RelWithDebInfo -g)
-    get_filename_component(sycl_library_debug_name ${SYCL_LIBRARY_DEBUG} NAME_WE)
     list(APPEND sycl_compiler_flags_Debug
                 -g
                 -D_DEBUG
-                -nostdlib -Xclang --dependent-lib=msvcrtd
-                -Xclang --dependent-lib=${sycl_library_debug_name})
+                -nostdlib -Xclang --dependent-lib=msvcrtd)
 
     add_custom_command(
-      OUTPUT ${cycles_kernel_oneapi_lib}
+      OUTPUT ${cycles_kernel_oneapi_lib} ${cycles_kernel_oneapi_linker_lib}
       COMMAND ${CMAKE_COMMAND} -E env
               "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib
               "PATH=${OCLOC_INSTALL_DIR}\;${sycl_compiler_root}"
diff --git a/intern/cycles/kernel/bvh/shadow_all.h b/intern/cycles/kernel/bvh/shadow_all.h
index 2ffe1496c72..b31ba479e4f 100644
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -229,7 +229,7 @@ ccl_device_inline
               /* Always use baked shadow transparency for curves. */
               if (isect.type & PRIMITIVE_CURVE) {
                 *r_throughput *= intersection_curve_shadow_transparency(
-                    kg, isect.object, isect.prim, isect.u);
+                    kg, isect.object, isect.prim, isect.type, isect.u);
 
                 if (*r_throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
                   return true;
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index a57703a8b8c..9ba787550c5 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -190,10 +190,8 @@ ccl_device_inline int intersection_find_attribute(KernelGlobals kg,
 /* Cut-off value to stop transparent shadow tracing when practically opaque. */
 #define CURVE_SHADOW_TRANSPARENCY_CUTOFF 0.001f
 
-ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg,
-                                                               const int object,
-                                                               const int prim,
-                                                               const float u)
+ccl_device_inline float intersection_curve_shadow_transparency(
+    KernelGlobals kg, const int object, const int prim, const int type, const float u)
 {
   /* Find attribute. */
   const int offset = intersection_find_attribute(kg, object, ATTR_STD_SHADOW_TRANSPARENCY);
@@ -204,7 +202,7 @@ ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg,
 
   /* Interpolate transparency between curve keys. */
   const KernelCurve kcurve = kernel_data_fetch(curves, prim);
-  const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(kcurve.type);
+  const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(type);
   const int k1 = k0 + 1;
 
   const float f0 = kernel_data_fetch(attributes_float, offset + k0);
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h
index d9267e1cd6d..2d7d8c2d704 100644
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -252,7 +252,7 @@ ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArgum
       /* Always use baked shadow transparency for curves. */
       if (current_isect.type & PRIMITIVE_CURVE) {
         ctx->throughput *= intersection_curve_shadow_transparency(
-            kg, current_isect.object, current_isect.prim, current_isect.u);
+            kg, current_isect.object, current_isect.prim, current_isect.type, current_isect.u);
 
         if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
           ctx->opaque_hit = true;
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index c1df49c4f49..38cdcb572eb 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN
  * and keep device specific code in compat.h */
 
 #ifdef __KERNEL_ONEAPI__
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-template<typename IsActiveOp>
-void cpu_serial_active_index_array_impl(const uint num_states,
-                                        ccl_global int *ccl_restrict indices,
-                                        ccl_global int *ccl_restrict num_indices,
-                                        IsActiveOp is_active_op)
-{
-  int write_index = 0;
-  for (int state_index = 0; state_index < num_states; state_index++) {
-    if (is_active_op(state_index))
-      indices[write_index++] = state_index;
-  }
-  *num_indices = write_index;
-  return;
-}
-#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
 
 template<typename IsActiveOp>
 void gpu_parallel_active_index_array_impl(const uint num_states,
@@ -182,18 +166,11 @@ __device__
                                          num_simd_groups, \
                                          simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      if (ccl_gpu_global_size_x() == 1) \
-        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
-      else \
-        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
-#  else
-#    define gpu_parallel_active_index_array( \
-        blocksize, num_states, indices, num_indices, is_active_op) \
-      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
-#  endif
+
+#  define gpu_parallel_active_index_array( \
+      blocksize, num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+
 #else
 
 #  define gpu_parallel_active_index_array( \
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
index 99cb1e3826e..e75ec9cadec 100644
--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -34,21 +34,48 @@ class MetalKernelContext {
       kernel_assert(0);
       return 0;
     }
-
+    
+#ifdef __KERNEL_METAL_INTEL__
+    template<typename TextureType, typename CoordsType>
+    inline __attribute__((__always_inline__))
+    auto ccl_gpu_tex_object_read_intel_workaround(TextureType texture_array,
+                                                  const uint tid, const uint sid,
+                                                  CoordsType coords) const
+    {
+      switch(sid) {
+        default:
+        case 0: return texture_array[tid].tex.sample(sampler(address::repeat, filter::nearest), coords);
+        case 1: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::nearest), coords);
+        case 2: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::nearest), coords);
+        case 3: return texture_array[tid].tex.sample(sampler(address::repeat, filter::linear), coords);
+        case 4: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::linear), coords);
+        case 5: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::linear), coords);
+      }
+    }
+#endif
+    
     // texture2d
     template<>
     inline __attribute__((__always_inline__))
     float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y));
+#endif
     }
     template<>
     inline __attribute__((__always_inline__))
     float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)).x;
+#endif
     }
 
     // texture3d
@@ -57,14 +84,22 @@ class MetalKernelContext {
     float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z));
+#endif
     }
     template<>
     inline __attribute__((__always_inline__))
     float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const {
       const uint tid(tex);
       const uint sid(tex >> 32);
+#ifndef __KERNEL_METAL_INTEL__
       return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+#else
+      return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)).x;
+#endif
     }
 #    include "kernel/device/gpu/image.h"
 
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index 5646c7446db..8b69ee025cd 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -228,7 +228,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   /* Always use baked shadow transparency for curves. */
   if (type & PRIMITIVE_CURVE) {
     float throughput = payload.throughput;
-    throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, type, u);
     payload.throughput = throughput;
     payload.num_hits += 1;
 
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 8ae40b0612e..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -55,18 +55,6 @@
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers)
 #define ccl_gpu_kernel_threads(block_num_threads)
 
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define KG_ND_ITEMS \
-  kg->nd_item_local_id_0 = item.get_local_id(0); \
-  kg->nd_item_local_range_0 = item.get_local_range(0); \
-  kg->nd_item_group_0 = item.get_group(0); \
-  kg->nd_item_group_range_0 = item.get_group_range(0); \
-  kg->nd_item_global_id_0 = item.get_global_id(0); \
-  kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-# define KG_ND_ITEMS
-#endif
-
 #define ccl_gpu_kernel_signature(name, ...) \
 void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                           size_t kernel_global_size, \
@@ -76,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
       (kg); \
       cgh.parallel_for<class kernel_##name>( \
           sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
-          [=](sycl::nd_item<1> item) { \
-            KG_ND_ITEMS
+          [=](sycl::nd_item<1> item) {
 
 #define ccl_gpu_kernel_postfix \
           }); \
@@ -95,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
   } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
 
 /* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
 
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
 
 /* GPU warp synchronization */
-
 #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
 #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
 #undef KERNEL_DATA_ARRAY
   IntegratorStateGPU *integrator_state;
   const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  size_t nd_item_local_id_0;
-  size_t nd_item_local_range_0;
-  size_t nd_item_group_0;
-  size_t nd_item_group_range_0;
-
-  size_t nd_item_global_id_0;
-  size_t nd_item_global_range_0;
-#endif
 } KernelGlobalsGPU;
 
 typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 1d1700f036d..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -8,7 +8,7 @@
 #  include <map>
 #  include <set>
 
-#  include <CL/sycl.hpp>
+#  include <sycl/sycl.hpp>
 
 #  include "kernel/device/oneapi/compat.h"
 #  include "kernel/device/oneapi/globals.h"
@@ -25,38 +25,57 @@ void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
   s_error_user_ptr = user_ptr;
 }
 
-/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
- * also trigger runtime compilation of all existing oneAPI kernels */
+/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like
+ * memory allocations, memory transfers and execution of kernel with USM memory. */
 bool oneapi_run_test_kernel(SyclQueue *queue_)
 {
   assert(queue_);
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  size_t N = 8;
-  sycl::buffer<float, 1> A(N);
-  sycl::buffer<float, 1> B(N);
-
-  {
-    sycl::host_accessor A_host_acc(A, sycl::write_only);
-    for (size_t i = (size_t)0; i < N; i++)
-      A_host_acc[i] = rand() % 32;
-  }
+  const size_t N = 8;
+  const size_t memory_byte_size = sizeof(int) * N;
 
+  bool is_computation_correct = true;
   try {
-    queue->submit([&](sycl::handler &cgh) {
-      sycl::accessor A_acc(A, cgh, sycl::read_only);
-      sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
+    int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+    for (size_t i = (size_t)0; i < N; i++) {
+      A_host[i] = rand() % 32;
+    }
+
+    int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
+    int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
 
-      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+    queue->memcpy(A_device, A_host, memory_byte_size);
+    queue->wait_and_throw();
+
+    queue->submit([&](sycl::handler &cgh) {
+      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); });
     });
     queue->wait_and_throw();
 
-    sycl::host_accessor A_host_acc(A, sycl::read_only);
-    sycl::host_accessor B_host_acc(B, sycl::read_only);
+    int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+    queue->memcpy(B_host, B_device, memory_byte_size);
+    queue->wait_and_throw();
 
     for (size_t i = (size_t)0; i < N; i++) {
-      float result = A_host_acc[i] + B_host_acc[i];
-      (void)result;
+      const int expected_result = i + A_host[i];
+      if (B_host[i] != expected_result) {
+        is_computation_correct = false;
+        if (s_error_cb) {
+          s_error_cb(("Incorrect result in test kernel execution -  expected " +
+                      std::to_string(expected_result) + ", got " + std::to_string(B_host[i]))
+                         .c_str(),
+                     s_error_user_ptr);
+        }
+      }
     }
+
+    sycl::free(A_host, *queue);
+    sycl::free(B_host, *queue);
+    sycl::free(A_device, *queue);
+    sycl::free(B_device, *queue);
+    queue->wait_and_throw();
   }
   catch (sycl::exception const &e) {
     if (s_error_cb) {
@@ -65,7 +84,7 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
     return false;
   }
 
-  return true;
+  return is_computation_correct;
 }
 
 /* TODO: Move device information to OneapiDevice initialized on creation and use it. */
@@ -123,6 +142,56 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
   return std::min(limit_work_group_size, preferred_work_group_size);
 }
 
+bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
+{
+#  ifdef SYCL_SKIP_KERNELS_PRELOAD
+  (void)queue_;
+  (void)requested_features;
+#  else
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+
+  try {
+    sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+                                                           {queue->get_device()});
+
+    for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+      const std::string &kernel_name = kernel_id.get_name();
+
+      /* NOTE(@nsirgien): Names in this conditions below should match names from
+       * oneapi_call macro in oneapi_enqueue_kernel below */
+      if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
+              std::string::npos) {
+        continue;
+      }
+
+      sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
+          sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+      sycl::build(one_kernel_bundle);
+    }
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+#  endif
+  return true;
+}
+
 bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                            int kernel,
                            size_t global_size,
@@ -161,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
     /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
      * we extend work size to fit uniformity requirements. */
     global_size = groups_count * local_size;
-
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-    if (queue->get_device().is_host()) {
-      global_size = 1;
-      local_size = 1;
-    }
-#  endif
   }
 
   /* Let the compiler throw an error if there are any kernels missing in this implementation. */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index 7456d0e4902..2bfc0b89c87 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -48,6 +48,8 @@ CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
                                                        int kernel,
                                                        size_t global_size,
                                                        void **args);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
+                                                     const unsigned int requested_features);
 #  ifdef __cplusplus
 }
 #  endif
diff --git a/intern/cycles/kernel/device/optix/bvh.h b/intern/cycles/kernel/device/optix/bvh.h
index fb9907709ce..6d81b44660c 100644
--- a/intern/cycles/kernel/device/optix/bvh.h
+++ b/intern/cycles/kernel/device/optix/bvh.h
@@ -202,7 +202,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
   /* Always use baked shadow transparency for curves. */
   if (type & PRIMITIVE_CURVE) {
     float throughput = __uint_as_float(optixGetPayload_1());
-    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, type, u);
     optixSetPayload_1(__float_as_uint(throughput));
     optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
 
diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h
index 038f0379bbc..23885306885 100644
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -279,7 +279,15 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
 }
 
 /* Compute constraint derivatives. */
-ccl_device_forceinline bool mnee_compute_constraint_derivatives(
+
+#  if defined(__KERNEL_METAL__)
+/* Temporary workaround for front-end compilation bug (incorrect MNEE rendering when this is
+ * inlined). */
+__attribute__((noinline))
+#  else
+ccl_device_forceinline
+#  endif
+bool mnee_compute_constraint_derivatives(
     int vertex_count,
     ccl_private ManifoldVertex *vertices,
     ccl_private const float3 &surface_sample_pos,
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index 4b03c665e17..40961b1c5fb 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -76,6 +76,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
       &kernel_integrator_state.next_shadow_path_index[0], 1);
   atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+#  ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr;
+#  endif
   return shadow_state;
 }
 
@@ -181,6 +184,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
 {
   IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+#  ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr;
+#  endif
   return shadow_state;
 }
 
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index ebdecc1bff9..e12f333b3a5 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -100,7 +100,7 @@ ccl_device_inline bool sample_is_class_A(int pattern, int sample)
   if (!(pattern == SAMPLING_PATTERN_PMJ || pattern == SAMPLING_PATTERN_SOBOL_BURLEY)) {
     /* Fallback: assign samples randomly.
      * This is guaranteed to work "okay" for any sampler, but isn't good.
-     * (Note: the seed constant is just a random number to guard against
+     * (NOTE: the seed constant is just a random number to guard against
      * possible interactions with other uses of the hash. There's nothing
      * special about it.)
      */
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 1469d915d15..8f7cfd19169 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -85,9 +85,9 @@ CCL_NAMESPACE_BEGIN
 #  define __VOLUME_RECORD_ALL__
 #endif /* !__KERNEL_GPU__ */
 
-/* MNEE currently causes "Compute function exceeds available temporary registers"
- * on Metal, disabled for now. */
-#ifndef __KERNEL_METAL__
+/* MNEE caused "Compute function exceeds available temporary registers" in macOS < 13 due to a bug
+ * in spill buffer allocation sizing. */
+#if !defined(__KERNEL_METAL__) || (__KERNEL_METAL_MACOS__ >= 13)
 #  define __MNEE__
 #endif
 
diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp
index 8792393e5a1..7bcf1ccb073 100644
--- a/intern/cycles/scene/image_oiio.cpp
+++ b/intern/cycles/scene/image_oiio.cpp
@@ -192,8 +192,22 @@ bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata,
     return false;
   }
 
-  const bool do_associate_alpha = associate_alpha &&
-                                  spec.get_int_attribute("oiio:UnassociatedAlpha", 0);
+  bool do_associate_alpha = false;
+  if (associate_alpha) {
+    do_associate_alpha = spec.get_int_attribute("oiio:UnassociatedAlpha", 0);
+
+    if (!do_associate_alpha && spec.alpha_channel != -1) {
+      /* Workaround OIIO not detecting TGA file alpha the same as Blender (since #3019).
+       * We want anything not marked as premultiplied alpha to get associated. */
+      if (strcmp(in->format_name(), "targa") == 0) {
+        do_associate_alpha = spec.get_int_attribute("targa:alpha_type", -1) != 4;
+      }
+      /* OIIO DDS reader never sets UnassociatedAlpha attribute. */
+      if (strcmp(in->format_name(), "dds") == 0) {
+        do_associate_alpha = true;
+      }
+    }
+  }
 
   switch (metadata.type) {
     case IMAGE_DATA_TYPE_BYTE:
diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp
index a0eb3196a34..acaa55f4990 100644
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -43,6 +43,10 @@ Session::Session(const SessionParams &params_, const SceneParams &scene_params)
 
   device = Device::create(params.device, stats, profiler);
 
+  if (device->have_error()) {
+    progress.set_error(device->error_message());
+  }
+
   scene = new Scene(scene_params, device);
 
   /* Configure path tracer. */
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index 0905b3ec5c9..3a2e0e074a2 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -417,15 +417,11 @@ ccl_device_inline int floor_to_int(float f)
   return float_to_int(floorf(f));
 }
 
-ccl_device_inline int quick_floor_to_int(float x)
-{
-  return float_to_int(x) - ((x < 0) ? 1 : 0);
-}
-
 ccl_device_inline float floorfrac(float x, ccl_private int *i)
 {
-  *i = quick_floor_to_int(x);
-  return x - *i;
+  float f = floorf(x);
+  *i = float_to_int(f);
+  return x - f;
 }
 
 ccl_device_inline int ceil_to_int(float f)
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index c408eadf195..eec7122b9dc 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -535,18 +535,6 @@ ccl_device_inline float3 pow(float3 v, float e)
   return make_float3(powf(v.x, e), powf(v.y, e), powf(v.z, e));
 }
 
-ccl_device_inline int3 quick_floor_to_int3(const float3 a)
-{
-#ifdef __KERNEL_SSE__
-  int3 b = int3(_mm_cvttps_epi32(a.m128));
-  int3 isneg = int3(_mm_castps_si128(_mm_cmplt_ps(a.m128, _mm_set_ps1(0.0f))));
-  /* Unsaturated add 0xffffffff is the same as subtract -1. */
-  return b + isneg;
-#else
-  return make_int3(quick_floor_to_int(a.x), quick_floor_to_int(a.y), quick_floor_to_int(a.z));
-#endif
-}
-
 ccl_device_inline bool isfinite_safe(float3 v)
 {
   return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z);
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
index a2fff94303e..1e2bfa90354 100644
--- a/intern/cycles/util/ssef.h
+++ b/intern/cycles/util/ssef.h
@@ -5,6 +5,8 @@
 #ifndef __UTIL_SSEF_H__
 #define __UTIL_SSEF_H__
 
+#include <math.h>
+
 #include "util/ssei.h"
 
 CCL_NAMESPACE_BEGIN
@@ -521,7 +523,7 @@ __forceinline const ssef round_zero(const ssef &a)
 __forceinline const ssef floor(const ssef &a)
 {
 #    ifdef __KERNEL_NEON__
-  return vrndnq_f32(a);
+  return vrndmq_f32(a);
 #    else
   return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
 #    endif
@@ -534,6 +536,12 @@ __forceinline const ssef ceil(const ssef &a)
   return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
 #    endif
 }
+#  else
+/* Non-SSE4.1 fallback, needed for floorfrac. */
+__forceinline const ssef floor(const ssef &a)
+{
+  return _mm_set_ps(floorf(a.f[3]), floorf(a.f[2]), floorf(a.f[1]), floorf(a.f[0]));
+}
 #  endif
 
 __forceinline ssei truncatei(const ssef &a)
@@ -541,20 +549,11 @@ __forceinline ssei truncatei(const ssef &a)
   return _mm_cvttps_epi32(a.m128);
 }
 
-/* This is about 25% faster than straightforward floor to integer conversion
- * due to better pipelining.
- *
- * Unsaturated add 0xffffffff (a < 0) is the same as subtract -1.
- */
-__forceinline ssei floori(const ssef &a)
-{
-  return truncatei(a) + cast((a < 0.0f).m128);
-}
-
 __forceinline ssef floorfrac(const ssef &x, ssei *i)
 {
-  *i = floori(x);
-  return x - ssef(*i);
+  ssef f = floor(x);
+  *i = truncatei(f);
+  return x - f;
 }
 
 ////////////////////////////////////////////////////////////////////////////////