Merge branch 'master' into cycles_oneapi

author: Sergey Sharybin <sergey@blender.org> 2022-06-08 11:58:35 +0300
committer: Sergey Sharybin <sergey@blender.org> 2022-06-08 11:58:35 +0300
commit: f31cef6248fd12039a4d2dfb76b26f3426477a70 (patch)
tree: 4ffb531519115c06766354c4fa4acaebeeb5161e /intern/cycles
parent: 15ee45fa029de175b5e366b0b1e9243a89dae543 (diff)
parent: 5b0e9bd97504cd89f57dbbaa4814b7d0dd0f2ccd (diff)
73 files changed, 1066 insertions, 398 deletions
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 74b28b8ea21..05f27bdbd4d 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 bl_info = {
diff --git a/intern/cycles/blender/addon/camera.py b/intern/cycles/blender/addon/camera.py
index 0e78112699e..3c821c98128 100644
--- a/intern/cycles/blender/addon/camera.py
+++ b/intern/cycles/blender/addon/camera.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
 
-# <pep8 compliant>
-
 # Fit to match default projective camera with focal_length 50 and sensor_width 36.
 default_fisheye_polynomial = [
     -1.1735143712967577e-05,
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 724e1b8f727..e211f53cf31 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 
diff --git a/intern/cycles/blender/addon/operators.py b/intern/cycles/blender/addon/operators.py
index e5d7f00a381..ab474cda0ab 100644
--- a/intern/cycles/blender/addon/operators.py
+++ b/intern/cycles/blender/addon/operators.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 import bpy
diff --git a/intern/cycles/blender/addon/osl.py b/intern/cycles/blender/addon/osl.py
index 9430dc5d115..1ee7ae421e3 100644
--- a/intern/cycles/blender/addon/osl.py
+++ b/intern/cycles/blender/addon/osl.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 import bpy
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index 5eaa592a9de..cc6d574da99 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 from bl_operators.presets import AddPresetBase
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 021b61d4751..5835d29c088 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 import bpy
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 71f64d7d360..8565ad7b263 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 import bpy
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 531ecc177da..12880496dfd 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation
-
-# <pep8 compliant>
 from __future__ import annotations
 
 import bpy
diff --git a/intern/cycles/blender/image.cpp b/intern/cycles/blender/image.cpp
index e01b72c1653..aea79ad60ad 100644
--- a/intern/cycles/blender/image.cpp
+++ b/intern/cycles/blender/image.cpp
@@ -7,6 +7,8 @@
 #include "blender/session.h"
 #include "blender/util.h"
 
+#include "util/half.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Packed Images */
@@ -19,17 +21,40 @@ BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
       frame(frame),
       tile_number(tile_number),
       /* Don't free cache for preview render to avoid race condition from T93560, to be fixed
-         properly later as we are close to release. */
+       * properly later as we are close to release. */
       free_cache(!is_preview_render && !b_image.has_data())
 {
 }
 
 bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaData &metadata)
 {
-  metadata.width = b_image.size()[0];
-  metadata.height = b_image.size()[1];
+  if (b_image.source() != BL::Image::source_TILED) {
+    /* Image sequence might have different dimensions, and hence needs to be handled in a special
+     * manner.
+     * NOTE: Currently the sequences are not handled by this image loader. */
+    assert(b_image.source() != BL::Image::source_SEQUENCE);
+
+    metadata.width = b_image.size()[0];
+    metadata.height = b_image.size()[1];
+    metadata.channels = b_image.channels();
+  }
+  else {
+    /* Different UDIM tiles might have different resolutions, so get resolution from the actual
+     * tile. */
+    BL::UDIMTile b_udim_tile = b_image.tiles.get(tile_number);
+    if (b_udim_tile) {
+      metadata.width = b_udim_tile.size()[0];
+      metadata.height = b_udim_tile.size()[1];
+      metadata.channels = b_udim_tile.channels();
+    }
+    else {
+      metadata.width = 0;
+      metadata.height = 0;
+      metadata.channels = 0;
+    }
+  }
+
   metadata.depth = 1;
-  metadata.channels = b_image.channels();
 
   if (b_image.is_float()) {
     if (metadata.channels == 1) {
@@ -62,80 +87,134 @@ bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaDat
 }
 
 bool BlenderImageLoader::load_pixels(const ImageMetaData &metadata,
-                                     void *pixels,
-                                     const size_t pixels_size,
+                                     void *out_pixels,
+                                     const size_t out_pixels_size,
                                      const bool associate_alpha)
 {
   const size_t num_pixels = ((size_t)metadata.width) * metadata.height;
   const int channels = metadata.channels;
 
-  if (b_image.is_float()) {
-    /* image data */
-    float *image_pixels;
-    image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile_number);
+  if (metadata.type == IMAGE_DATA_TYPE_FLOAT || metadata.type == IMAGE_DATA_TYPE_FLOAT4) {
+    /* Float. */
+    float *in_pixels = image_get_float_pixels_for_frame(b_image, frame, tile_number);
 
-    if (image_pixels && num_pixels * channels == pixels_size) {
-      memcpy(pixels, image_pixels, pixels_size * sizeof(float));
+    if (in_pixels && num_pixels * channels == out_pixels_size) {
+      /* Straight copy pixel data. */
+      memcpy(out_pixels, in_pixels, out_pixels_size * sizeof(float));
     }
     else {
+      /* Missing or invalid pixel data. */
       if (channels == 1) {
-        memset(pixels, 0, num_pixels * sizeof(float));
+        memset(out_pixels, 0, num_pixels * sizeof(float));
       }
       else {
-        const size_t num_pixels_safe = pixels_size / channels;
-        float *fp = (float *)pixels;
-        for (int i = 0; i < num_pixels_safe; i++, fp += channels) {
-          fp[0] = 1.0f;
-          fp[1] = 0.0f;
-          fp[2] = 1.0f;
+        const size_t num_pixels_safe = out_pixels_size / channels;
+        float *out_pixel = (float *)out_pixels;
+        for (int i = 0; i < num_pixels_safe; i++, out_pixel += channels) {
+          out_pixel[0] = 1.0f;
+          out_pixel[1] = 0.0f;
+          out_pixel[2] = 1.0f;
           if (channels == 4) {
-            fp[3] = 1.0f;
+            out_pixel[3] = 1.0f;
           }
         }
       }
     }
 
-    if (image_pixels) {
-      MEM_freeN(image_pixels);
+    if (in_pixels) {
+      MEM_freeN(in_pixels);
     }
   }
-  else {
-    unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile_number);
+  else if (metadata.type == IMAGE_DATA_TYPE_HALF || metadata.type == IMAGE_DATA_TYPE_HALF4) {
+    /* Half float. Blender does not have a half type, but in some cases
+     * we up-sample byte to half to avoid precision loss for colorspace
+     * conversion. */
+    unsigned char *in_pixels = image_get_pixels_for_frame(b_image, frame, tile_number);
 
-    if (image_pixels && num_pixels * channels == pixels_size) {
-      memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
+    if (in_pixels && num_pixels * channels == out_pixels_size) {
+      /* Convert uchar to half. */
+      const uchar *in_pixel = in_pixels;
+      half *out_pixel = (half *)out_pixels;
+      if (associate_alpha && channels == 4) {
+        for (size_t i = 0; i < num_pixels; i++, in_pixel += 4, out_pixel += 4) {
+          const float alpha = util_image_cast_to_float(in_pixel[3]);
+          out_pixel[0] = float_to_half_image(util_image_cast_to_float(in_pixel[0]) * alpha);
+          out_pixel[1] = float_to_half_image(util_image_cast_to_float(in_pixel[1]) * alpha);
+          out_pixel[2] = float_to_half_image(util_image_cast_to_float(in_pixel[2]) * alpha);
+          out_pixel[3] = float_to_half_image(alpha);
+        }
+      }
+      else {
+        for (size_t i = 0; i < num_pixels; i++) {
+          for (int c = 0; c < channels; c++, in_pixel++, out_pixel++) {
+            *out_pixel = float_to_half_image(util_image_cast_to_float(*in_pixel));
+          }
+        }
+      }
     }
     else {
+      /* Missing or invalid pixel data. */
       if (channels == 1) {
-        memset(pixels, 0, pixels_size * sizeof(unsigned char));
+        memset(out_pixels, 0, num_pixels * sizeof(half));
       }
       else {
-        const size_t num_pixels_safe = pixels_size / channels;
-        unsigned char *cp = (unsigned char *)pixels;
-        for (size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
-          cp[0] = 255;
-          cp[1] = 0;
-          cp[2] = 255;
+        const size_t num_pixels_safe = out_pixels_size / channels;
+        half *out_pixel = (half *)out_pixels;
+        for (int i = 0; i < num_pixels_safe; i++, out_pixel += channels) {
+          out_pixel[0] = float_to_half_image(1.0f);
+          out_pixel[1] = float_to_half_image(0.0f);
+          out_pixel[2] = float_to_half_image(1.0f);
           if (channels == 4) {
-            cp[3] = 255;
+            out_pixel[3] = float_to_half_image(1.0f);
           }
         }
       }
     }
 
-    if (image_pixels) {
-      MEM_freeN(image_pixels);
+    if (in_pixels) {
+      MEM_freeN(in_pixels);
     }
+  }
+  else {
+    /* Byte. */
+    unsigned char *in_pixels = image_get_pixels_for_frame(b_image, frame, tile_number);
+
+    if (in_pixels && num_pixels * channels == out_pixels_size) {
+      /* Straight copy pixel data. */
+      memcpy(out_pixels, in_pixels, out_pixels_size * sizeof(unsigned char));
 
-    if (associate_alpha) {
-      /* Premultiply, byte images are always straight for Blender. */
-      unsigned char *cp = (unsigned char *)pixels;
-      for (size_t i = 0; i < num_pixels; i++, cp += channels) {
-        cp[0] = (cp[0] * cp[3]) / 255;
-        cp[1] = (cp[1] * cp[3]) / 255;
-        cp[2] = (cp[2] * cp[3]) / 255;
+      if (associate_alpha && channels == 4) {
+        /* Premultiply, byte images are always straight for Blender. */
+        unsigned char *out_pixel = (unsigned char *)out_pixels;
+        for (size_t i = 0; i < num_pixels; i++, out_pixel += 4) {
+          out_pixel[0] = (out_pixel[0] * out_pixel[3]) / 255;
+          out_pixel[1] = (out_pixel[1] * out_pixel[3]) / 255;
+          out_pixel[2] = (out_pixel[2] * out_pixel[3]) / 255;
+        }
+      }
+    }
+    else {
+      /* Missing or invalid pixel data. */
+      if (channels == 1) {
+        memset(out_pixels, 0, out_pixels_size * sizeof(unsigned char));
+      }
+      else {
+        const size_t num_pixels_safe = out_pixels_size / channels;
+        unsigned char *out_pixel = (unsigned char *)out_pixels;
+        for (size_t i = 0; i < num_pixels_safe; i++, out_pixel += channels) {
+          out_pixel[0] = 255;
+          out_pixel[1] = 0;
+          out_pixel[2] = 255;
+          if (channels == 4) {
+            out_pixel[3] = 255;
+          }
+        }
       }
     }
+
+    if (in_pixels) {
+      MEM_freeN(in_pixels);
+    }
   }
 
   /* Free image buffers to save memory during render. */
diff --git a/intern/cycles/blender/mesh.cpp b/intern/cycles/blender/mesh.cpp
index c76ce3801d4..e2db52cc5c1 100644
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -267,75 +267,62 @@ static void mikk_compute_tangents(
   genTangSpaceDefault(&context);
 }
 
-/* Create sculpt vertex color attributes. */
-static void attr_create_sculpt_vertex_color(Scene *scene,
-                                            Mesh *mesh,
-                                            BL::Mesh &b_mesh,
-                                            bool subdivision)
-{
-  for (BL::MeshVertColorLayer &l : b_mesh.sculpt_vertex_colors) {
-    const bool active_render = l.active_render();
-    AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
-    ustring vcol_name = ustring(l.name().c_str());
-
-    const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
-                           mesh->need_attribute(scene, vcol_std);
-
-    if (!need_vcol) {
-      continue;
-    }
-
-    AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
-    Attribute *vcol_attr = attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_VERTEX);
-    vcol_attr->std = vcol_std;
-
-    float4 *cdata = vcol_attr->data_float4();
-    int numverts = b_mesh.vertices.length();
-
-    for (int i = 0; i < numverts; i++) {
-      *(cdata++) = get_float4(l.data[i].color());
-    }
-  }
-}
-
 template<typename TypeInCycles, typename GetValueAtIndex>
 static void fill_generic_attribute(BL::Mesh &b_mesh,
                                    TypeInCycles *data,
                                    const BL::Attribute::domain_enum b_domain,
+                                   const bool subdivision,
                                    const GetValueAtIndex &get_value_at_index)
 {
   switch (b_domain) {
     case BL::Attribute::domain_CORNER: {
-      for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) {
-        const int index = t.index() * 3;
-        BL::Array<int, 3> loops = t.loops();
-        data[index] = get_value_at_index(loops[0]);
-        data[index + 1] = get_value_at_index(loops[1]);
-        data[index + 2] = get_value_at_index(loops[2]);
+      if (subdivision) {
+        for (BL::MeshPolygon &p : b_mesh.polygons) {
+          int n = p.loop_total();
+          for (int i = 0; i < n; i++) {
+            *data = get_value_at_index(p.loop_start() + i);
+            data++;
+          }
+        }
+      }
+      else {
+        for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) {
+          const int index = t.index() * 3;
+          BL::Array<int, 3> loops = t.loops();
+          data[index] = get_value_at_index(loops[0]);
+          data[index + 1] = get_value_at_index(loops[1]);
+          data[index + 2] = get_value_at_index(loops[2]);
+        }
       }
       break;
     }
     case BL::Attribute::domain_EDGE: {
-      /* Average edge attributes at vertices. */
-      const size_t num_verts = b_mesh.vertices.length();
-      vector<int> count(num_verts, 0);
-
-      for (BL::MeshEdge &e : b_mesh.edges) {
-        BL::Array<int, 2> vertices = e.vertices();
-        TypeInCycles value = get_value_at_index(e.index());
-
-        data[vertices[0]] += value;
-        data[vertices[1]] += value;
-        count[vertices[0]]++;
-        count[vertices[1]]++;
+      if constexpr (std::is_same_v<TypeInCycles, uchar4>) {
+        /* uchar4 edge attributes do not exist, and averaging in place
+         * would not work. */
+        assert(0);
       }
+      else {
+        /* Average edge attributes at vertices. */
+        const size_t num_verts = b_mesh.vertices.length();
+        vector<int> count(num_verts, 0);
+
+        for (BL::MeshEdge &e : b_mesh.edges) {
+          BL::Array<int, 2> vertices = e.vertices();
+          TypeInCycles value = get_value_at_index(e.index());
+
+          data[vertices[0]] += value;
+          data[vertices[1]] += value;
+          count[vertices[0]]++;
+          count[vertices[1]]++;
+        }
 
-      for (size_t i = 0; i < num_verts; i++) {
-        if (count[i] > 1) {
-          data[i] /= (float)count[i];
+        for (size_t i = 0; i < num_verts; i++) {
+          if (count[i] > 1) {
+            data[i] /= (float)count[i];
+          }
         }
       }
-
       break;
     }
     case BL::Attribute::domain_POINT: {
@@ -346,8 +333,16 @@ static void fill_generic_attribute(BL::Mesh &b_mesh,
       break;
     }
     case BL::Attribute::domain_FACE: {
-      for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) {
-        data[t.index()] = get_value_at_index(t.polygon_index());
+      if (subdivision) {
+        const int num_polygons = b_mesh.polygons.length();
+        for (int i = 0; i < num_polygons; i++) {
+          data[i] = get_value_at_index(i);
+        }
+      }
+      else {
+        for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) {
+          data[t.index()] = get_value_at_index(t.polygon_index());
+        }
       }
       break;
     }
@@ -395,21 +390,22 @@ static void attr_create_generic(Scene *scene,
                                 const bool need_motion,
                                 const float motion_scale)
 {
-  if (subdivision) {
-    /* TODO: Handle subdivision correctly. */
-    return;
-  }
-  AttributeSet &attributes = mesh->attributes;
+  AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
   static const ustring u_velocity("velocity");
 
+  int attribute_index = 0;
+  int render_color_index = b_mesh.attributes.render_color_index();
+
   for (BL::Attribute &b_attribute : b_mesh.attributes) {
     const ustring name{b_attribute.name().c_str()};
+    const bool is_render_color = (attribute_index++ == render_color_index);
 
     if (need_motion && name == u_velocity) {
       attr_create_motion(mesh, b_attribute, motion_scale);
     }
 
-    if (!mesh->need_attribute(scene, name)) {
+    if (!(mesh->need_attribute(scene, name) ||
+          (is_render_color && mesh->need_attribute(scene, ATTR_STD_VERTEX_COLOR)))) {
       continue;
     }
     if (attributes.find(name)) {
@@ -445,15 +441,16 @@ static void attr_create_generic(Scene *scene,
         BL::FloatAttribute b_float_attribute{b_attribute};
         Attribute *attr = attributes.add(name, TypeFloat, element);
         float *data = attr->data_float();
-        fill_generic_attribute(
-            b_mesh, data, b_domain, [&](int i) { return b_float_attribute.data[i].value(); });
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
+          return b_float_attribute.data[i].value();
+        });
         break;
       }
       case BL::Attribute::data_type_BOOLEAN: {
         BL::BoolAttribute b_bool_attribute{b_attribute};
         Attribute *attr = attributes.add(name, TypeFloat, element);
         float *data = attr->data_float();
-        fill_generic_attribute(b_mesh, data, b_domain, [&](int i) {
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
           return (float)b_bool_attribute.data[i].value();
         });
         break;
@@ -462,25 +459,59 @@ static void attr_create_generic(Scene *scene,
         BL::IntAttribute b_int_attribute{b_attribute};
         Attribute *attr = attributes.add(name, TypeFloat, element);
         float *data = attr->data_float();
-        fill_generic_attribute(
-            b_mesh, data, b_domain, [&](int i) { return (float)b_int_attribute.data[i].value(); });
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
+          return (float)b_int_attribute.data[i].value();
+        });
         break;
       }
       case BL::Attribute::data_type_FLOAT_VECTOR: {
         BL::FloatVectorAttribute b_vector_attribute{b_attribute};
         Attribute *attr = attributes.add(name, TypeVector, element);
         float3 *data = attr->data_float3();
-        fill_generic_attribute(b_mesh, data, b_domain, [&](int i) {
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
           BL::Array<float, 3> v = b_vector_attribute.data[i].vector();
           return make_float3(v[0], v[1], v[2]);
         });
         break;
       }
+      case BL::Attribute::data_type_BYTE_COLOR: {
+        BL::ByteColorAttribute b_color_attribute{b_attribute};
+
+        if (element == ATTR_ELEMENT_CORNER) {
+          element = ATTR_ELEMENT_CORNER_BYTE;
+        }
+        Attribute *attr = attributes.add(name, TypeRGBA, element);
+        if (is_render_color) {
+          attr->std = ATTR_STD_VERTEX_COLOR;
+        }
+
+        if (element == ATTR_ELEMENT_CORNER_BYTE) {
+          uchar4 *data = attr->data_uchar4();
+          fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
+            /* Compress/encode vertex color using the sRGB curve. */
+            const float4 c = get_float4(b_color_attribute.data[i].color());
+            return color_float4_to_uchar4(color_linear_to_srgb_v4(c));
+          });
+        }
+        else {
+          float4 *data = attr->data_float4();
+          fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
+            BL::Array<float, 4> v = b_color_attribute.data[i].color();
+            return make_float4(v[0], v[1], v[2], v[3]);
+          });
+        }
+        break;
+      }
       case BL::Attribute::data_type_FLOAT_COLOR: {
         BL::FloatColorAttribute b_color_attribute{b_attribute};
+
         Attribute *attr = attributes.add(name, TypeRGBA, element);
+        if (is_render_color) {
+          attr->std = ATTR_STD_VERTEX_COLOR;
+        }
+
         float4 *data = attr->data_float4();
-        fill_generic_attribute(b_mesh, data, b_domain, [&](int i) {
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
           BL::Array<float, 4> v = b_color_attribute.data[i].color();
           return make_float4(v[0], v[1], v[2], v[3]);
         });
@@ -490,7 +521,7 @@ static void attr_create_generic(Scene *scene,
         BL::Float2Attribute b_float2_attribute{b_attribute};
         Attribute *attr = attributes.add(name, TypeFloat2, element);
         float2 *data = attr->data_float2();
-        fill_generic_attribute(b_mesh, data, b_domain, [&](int i) {
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
           BL::Array<float, 2> v = b_float2_attribute.data[i].vector();
           return make_float2(v[0], v[1]);
         });
@@ -503,69 +534,6 @@ static void attr_create_generic(Scene *scene,
   }
 }
 
-/* Create vertex color attributes. */
-static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivision)
-{
-  for (BL::MeshLoopColorLayer &l : b_mesh.vertex_colors) {
-    const bool active_render = l.active_render();
-    AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE;
-    ustring vcol_name = ustring(l.name().c_str());
-
-    const bool need_vcol = mesh->need_attribute(scene, vcol_name) ||
-                           mesh->need_attribute(scene, vcol_std);
-
-    if (!need_vcol) {
-      continue;
-    }
-
-    Attribute *vcol_attr = NULL;
-
-    if (subdivision) {
-      if (active_render) {
-        vcol_attr = mesh->subd_attributes.add(vcol_std, vcol_name);
-      }
-      else {
-        vcol_attr = mesh->subd_attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
-      }
-
-      uchar4 *cdata = vcol_attr->data_uchar4();
-
-      for (BL::MeshPolygon &p : b_mesh.polygons) {
-        int n = p.loop_total();
-        for (int i = 0; i < n; i++) {
-          float4 color = get_float4(l.data[p.loop_start() + i].color());
-          /* Compress/encode vertex color using the sRGB curve. */
-          *(cdata++) = color_float4_to_uchar4(color);
-        }
-      }
-    }
-    else {
-      if (active_render) {
-        vcol_attr = mesh->attributes.add(vcol_std, vcol_name);
-      }
-      else {
-        vcol_attr = mesh->attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE);
-      }
-
-      uchar4 *cdata = vcol_attr->data_uchar4();
-
-      for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) {
-        int3 li = get_int3(t.loops());
-        float4 c1 = get_float4(l.data[li[0]].color());
-        float4 c2 = get_float4(l.data[li[1]].color());
-        float4 c3 = get_float4(l.data[li[2]].color());
-
-        /* Compress/encode vertex color using the sRGB curve. */
-        cdata[0] = color_float4_to_uchar4(c1);
-        cdata[1] = color_float4_to_uchar4(c2);
-        cdata[2] = color_float4_to_uchar4(c3);
-
-        cdata += 3;
-      }
-    }
-  }
-}
-
 /* Create uv map attributes. */
 static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)
 {
@@ -1029,8 +997,6 @@ static void create_mesh(Scene *scene,
    * The calculate functions will check whether they're needed or not.
    */
   attr_create_pointiness(scene, mesh, b_mesh, subdivision);
-  attr_create_vertex_color(scene, mesh, b_mesh, subdivision);
-  attr_create_sculpt_vertex_color(scene, mesh, b_mesh, subdivision);
   attr_create_random_per_island(scene, mesh, b_mesh, subdivision);
   attr_create_generic(scene, mesh, b_mesh, subdivision, need_motion, motion_scale);
 
diff --git a/intern/cycles/blender/volume.cpp b/intern/cycles/blender/volume.cpp
index 8dd2d45c0b6..61b2f9ee276 100644
--- a/intern/cycles/blender/volume.cpp
+++ b/intern/cycles/blender/volume.cpp
@@ -219,7 +219,10 @@ static void sync_smoke_volume(
 
 class BlenderVolumeLoader : public VDBImageLoader {
  public:
-  BlenderVolumeLoader(BL::BlendData &b_data, BL::Volume &b_volume, const string &grid_name)
+  BlenderVolumeLoader(BL::BlendData &b_data,
+                      BL::Volume &b_volume,
+                      const string &grid_name,
+                      BL::VolumeRender::precision_enum precision_)
       : VDBImageLoader(grid_name), b_volume(b_volume)
   {
     b_volume.grids.load(b_data.ptr.data);
@@ -241,6 +244,22 @@ class BlenderVolumeLoader : public VDBImageLoader {
       }
     }
 #endif
+#ifdef WITH_NANOVDB
+    switch (precision_) {
+      case BL::VolumeRender::precision_FULL:
+        precision = 32;
+        break;
+      case BL::VolumeRender::precision_HALF:
+        precision = 16;
+        break;
+      default:
+      case BL::VolumeRender::precision_VARIABLE:
+        precision = 0;
+        break;
+    }
+#else
+    (void)precision_;
+#endif
   }
 
   BL::Volume b_volume;
@@ -318,7 +337,8 @@ static void sync_volume_object(BL::BlendData &b_data,
                             volume->attributes.add(std) :
                             volume->attributes.add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL);
 
-      ImageLoader *loader = new BlenderVolumeLoader(b_data, b_volume, name.string());
+      ImageLoader *loader = new BlenderVolumeLoader(
+          b_data, b_volume, name.string(), b_render.precision());
       ImageParams params;
       params.frame = b_volume.grids.frame();
 
diff --git a/intern/cycles/bvh/build.cpp b/intern/cycles/bvh/build.cpp
index 79e9b800690..1df3517673e 100644
--- a/intern/cycles/bvh/build.cpp
+++ b/intern/cycles/bvh/build.cpp
@@ -811,7 +811,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
     /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */
     unalignedSplitSAH = params.sah_node_cost * unaligned_split.bounds.half_area() +
                         params.sah_primitive_cost * unaligned_split.nodeSAH;
-    /* TOOD(sergey): Check we can create leaf already. */
+    /* TODO(sergey): Check we can create leaf already. */
     /* Check whether unaligned split is better than the regular one. */
     if (unalignedSplitSAH < splitSAH) {
       do_unalinged_split = true;
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 5b875565c97..6bcd8152874 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -146,8 +146,8 @@ if(CYCLES_STANDALONE_REPOSITORY)
       -DOIIO_STATIC_DEFINE
     )
 
-    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO_ROOT_DIR}/include) 
-    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR} ${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO) 
+    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO_ROOT_DIR}/include)
+    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR} ${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO)
     # Special exceptions for libraries which needs explicit debug version
     set(OPENIMAGEIO_LIBRARIES
       optimized ${OPENIMAGEIO_ROOT_DIR}/lib/OpenImageIO.lib
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 6908ae5ead3..cb7e909a2d5 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -457,6 +457,8 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
     /* Use the biggest kernel for estimation. */
     const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                          DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
+                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
+                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                          DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
 
     /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
@@ -1084,7 +1086,9 @@ void CUDADevice::tex_alloc(device_texture &mem)
   need_texture_info = true;
 
   if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
-      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16) {
     CUDA_RESOURCE_DESC resDesc;
     memset(&resDesc, 0, sizeof(resDesc));
 
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 7159277b325..ea68c821166 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -420,6 +420,8 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
     /* Use the biggest kernel for estimation. */
     const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                          DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
+                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
+                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                          DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
 
     /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
@@ -1042,7 +1044,9 @@ void HIPDevice::tex_alloc(device_texture &mem)
   need_texture_info = true;
 
   if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
-      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
+      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16) {
     /* Bindless textures. */
     hipResourceDesc resDesc;
     memset(&resDesc, 0, sizeof(resDesc));
diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 072731a2af5..96a99cd62cd 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -33,6 +33,8 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_shade_surface";
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
       return "integrator_shade_surface_raytrace";
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+      return "integrator_shade_surface_mnee";
     case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
       return "integrator_shade_volume";
     case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp
index 4c068dbdd3e..40cf2573cfb 100644
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -165,6 +165,8 @@ device_texture::device_texture(Device *device,
     case IMAGE_DATA_TYPE_BYTE:
     case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
     case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
+    case IMAGE_DATA_TYPE_NANOVDB_FPN:
+    case IMAGE_DATA_TYPE_NANOVDB_FP16:
       data_type = TYPE_UCHAR;
       data_elements = 1;
       break;
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
index 086fbb093ba..09c4ace081e 100644
--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -11,6 +11,7 @@
 #  include "util/progress.h"
 
 #  include "device/metal/bvh.h"
+#  include "device/metal/util.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN
     { \
       string str = string_printf(__VA_ARGS__); \
       progress.set_substatus(str); \
+      metal_printf("%s\n", str.c_str()); \
     }
 
 BVHMetal::BVHMetal(const BVHParams &params_,
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 7506b9b069f..0e6817d94f8 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -31,6 +31,8 @@ class MetalDevice : public Device {
   string source[PSO_NUM];
   string source_md5[PSO_NUM];
 
+  bool capture_enabled = false;
+
   KernelParamsMetal launch_params = {0};
 
   /* MetalRT members ----------------------------------*/
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index e1438a9d6e2..086bf0af979 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -86,6 +86,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
     use_metalrt = (atoi(metalrt) != 0);
   }
 
+  if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
+    capture_enabled = true;
+  }
+
   MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
   arg_desc_params.dataType = MTLDataTypePointer;
   arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -394,7 +398,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
   }
 
   if (size > 0) {
-    if (mem.type == MEM_DEVICE_ONLY) {
+    if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
       options = MTLResourceStorageModePrivate;
     }
 
@@ -697,8 +701,7 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
 void MetalDevice::tex_alloc(device_texture &mem)
 {
   /* Check that dimensions fit within maximum allowable size.
-     See https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-  */
+   * See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
   if (mem.data_width > 16384 || mem.data_height > 16384) {
     set_error(string_printf(
         "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 304efc813ec..fec4cd80466 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -35,7 +35,8 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
           device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
           device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
           device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
-          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
 }
 
 struct ShaderCache {
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index 6cc84a20787..de20514de0b 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -12,8 +12,6 @@
 #  include "device/metal/util.h"
 #  include "kernel/device/metal/globals.h"
 
-#  define metal_printf VLOG(4) << string_printf
-
 CCL_NAMESPACE_BEGIN
 
 class MetalDevice;
@@ -77,6 +75,38 @@ class MetalDeviceQueue : public DeviceQueue {
 
   void close_compute_encoder();
   void close_blit_encoder();
+
+  bool verbose_tracing = false;
+
+  /* Per-kernel profiling (see CYCLES_METAL_PROFILING). */
+
+  struct TimingData {
+    DeviceKernel kernel;
+    int work_size;
+    uint64_t timing_id;
+  };
+  std::vector<TimingData> command_encoder_labels;
+  id<MTLSharedEvent> timing_shared_event = nil;
+  uint64_t timing_shared_event_id;
+  uint64_t command_buffer_start_timing_id;
+
+  struct TimingStats {
+    double total_time = 0.0;
+    uint64_t total_work_size = 0;
+    uint64_t num_dispatches = 0;
+  };
+  TimingStats timing_stats[DEVICE_KERNEL_NUM];
+  double last_completion_time = 0.0;
+
+  /* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). */
+
+  id<MTLCaptureScope> mtlCaptureScope = nil;
+  DeviceKernel capture_kernel;
+  int capture_dispatch = 0;
+  int capture_dispatch_counter = 0;
+  bool is_capturing = false;
+  bool is_capturing_to_disk = false;
+  bool has_captured_to_disk = false;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index ec10e091b25..8b2d5d81859 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
   }
 
   wait_semaphore = dispatch_semaphore_create(0);
+
+  if (@available(macos 10.14, *)) {
+    if (getenv("CYCLES_METAL_PROFILING")) {
+      /* Enable per-kernel timing breakdown (shown at end of render). */
+      timing_shared_event = [mtlDevice newSharedEvent];
+    }
+    if (getenv("CYCLES_METAL_DEBUG")) {
+      /* Enable very verbose tracing (shows every dispatch). */
+      verbose_tracing = true;
+    }
+    timing_shared_event_id = 1;
+  }
+
+  capture_kernel = DeviceKernel(-1);
+  if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
+    /* Enable .gputrace capture for the specified DeviceKernel. */
+    MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
+    mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice];
+    mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
+    [captureManager setDefaultCaptureScope:mtlCaptureScope];
+
+    capture_dispatch = -1;
+    if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
+      capture_dispatch = atoi(capture_dispatch_str);
+      capture_dispatch_counter = 0;
+    }
+
+    capture_kernel = DeviceKernel(atoi(capture_kernel_str));
+    printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel));
+
+    if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
+      if (@available(macos 10.15, *)) {
+        if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
+
+          MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
+          captureDescriptor.captureObject = mtlCaptureScope;
+          captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
+          captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];
+
+          NSError *error;
+          if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
+            NSString *err = [error localizedDescription];
+            printf("Start capture failed: %s\n", [err UTF8String]);
+          }
+          else {
+            printf("Capture started (URL: %s)\n", capture_url);
+            is_capturing_to_disk = true;
+          }
+        }
+        else {
+          printf("Capture to file is not supported\n");
+        }
+      }
+    }
+  }
 }
 
 MetalDeviceQueue::~MetalDeviceQueue()
@@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue()
     [mtlCommandQueue release];
     mtlCommandQueue = nil;
   }
+
+  if (mtlCaptureScope) {
+    [mtlCaptureScope release];
+  }
+
+  double total_time = 0.0;
+
+  /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
+  int64_t total_work_size = 0;
+  int64_t num_dispatches = 0;
+  for (auto &stat : timing_stats) {
+    total_time += stat.total_time;
+    total_work_size += stat.total_work_size;
+    num_dispatches += stat.num_dispatches;
+  }
+
+  if (num_dispatches) {
+    printf("\nMetal dispatch stats:\n\n");
+    auto header = string_printf("%-40s %16s %12s %12s %7s %7s",
+                                "Kernel name",
+                                "Total threads",
+                                "Dispatches",
+                                "Avg. T/D",
+                                "Time",
+                                "Time%");
+    auto divider = string(header.length(), '-');
+    printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());
+
+    for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
+      auto &stat = timing_stats[i];
+      if (stat.num_dispatches > 0) {
+        printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
+               device_kernel_as_string(DeviceKernel(i)),
+               string_human_readable_number(stat.total_work_size).c_str(),
+               string_human_readable_number(stat.num_dispatches).c_str(),
+               string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(),
+               stat.total_time,
+               stat.total_time * 100.0 / total_time);
+      }
+    }
+    printf("%s\n", divider.c_str());
+    printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
+           "",
+           "",
+           string_human_readable_number(num_dispatches).c_str(),
+           "",
+           total_time,
+           100.0);
+    printf("%s\n\n", divider.c_str());
+  }
 }
 
 int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
@@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                                const int work_size,
                                DeviceKernelArguments const &args)
 {
+  if (kernel == capture_kernel) {
+    if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) {
+      /* Start gputrace capture. */
+      if (mtlCommandBuffer) {
+        synchronize();
+      }
+      [mtlCaptureScope beginScope];
+      printf("[mtlCaptureScope beginScope]\n");
+      is_capturing = true;
+    }
+    capture_dispatch_counter += 1;
+  }
+
   if (metal_device->have_error()) {
     return false;
   }
@@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
   id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
 
+  if (timing_shared_event) {
+    command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id});
+  }
+
   /* Determine size requirement for argument buffer. */
   size_t arg_buffer_length = 0;
   for (size_t i = 0; i < args.count; i++) {
@@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   /* Encode KernelParamsMetal buffers */
   [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
 
+  if (verbose_tracing || timing_shared_event || is_capturing) {
+    /* Add human-readable labels if we're doing any form of debugging / profiling. */
+    mtlComputeCommandEncoder.label = [[NSString alloc]
+        initWithFormat:@"Metal queue launch %s, work_size %d",
+                       device_kernel_as_string(kernel),
+                       work_size];
+  }
+
   /* this relies on IntegratorStateGPU layout being contiguous device_ptrs  */
   const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
                                    sizeof(IntegratorStateGPU);
@@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
     int pointer_index = offset / sizeof(device_ptr);
     MetalDevice::MetalMem *mmem = *(
         MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset);
-    if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) {
+    if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
       [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                      offset:0
                                                     atIndex:pointer_index];
@@ -270,6 +400,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
         case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
           break;
         default:
           bvhMetalRT = nil;
@@ -343,12 +474,53 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
     }
   }];
 
+  if (verbose_tracing || is_capturing) {
+    /* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */
+    synchronize();
+
+    /* Show queue counters and dispatch timing. */
+    if (verbose_tracing) {
+      if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
+        printf(
+            "_____________________________________.____________________.______________.___________"
+            "______________________________________\n");
+      }
+
+      printf("%-40s| %7d threads |%5.2fms | buckets [",
+             device_kernel_as_string(kernel),
+             work_size,
+             last_completion_time * 1000.0);
+      std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+      for (auto &it : metal_device->metal_mem_map) {
+        const string c_integrator_queue_counter = "integrator_queue_counter";
+        if (it.first->name == c_integrator_queue_counter) {
+          /* Workaround "device_copy_from" being protected. */
+          struct MyDeviceMemory : device_memory {
+            void device_copy_from__IntegratorQueueCounter()
+            {
+              device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter));
+            }
+          };
+          ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter();
+
+          if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
+                                                          it.first->host_pointer) {
+            for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++)
+              printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i]));
+          }
+          break;
+        }
+      }
+      printf("]\n");
+    }
+  }
+
   return !(metal_device->have_error());
 }
 
 bool MetalDeviceQueue::synchronize()
 {
-  if (metal_device->have_error()) {
+  if (has_captured_to_disk || metal_device->have_error()) {
     return false;
   }
 
@@ -358,6 +530,28 @@ bool MetalDeviceQueue::synchronize()
   close_blit_encoder();
 
   if (mtlCommandBuffer) {
+    scoped_timer timer;
+    if (timing_shared_event) {
+      /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */
+      __block double completion_time = 0;
+      for (uint64_t i = command_buffer_start_timing_id; i < timing_shared_event_id; i++) {
+        [timing_shared_event notifyListener:shared_event_listener
+                                    atValue:i
+                                      block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
+                                        completion_time = timer.get_time() - completion_time;
+                                        last_completion_time = completion_time;
+                                        for (auto label : command_encoder_labels) {
+                                          if (label.timing_id == value) {
+                                            TimingStats &stat = timing_stats[label.kernel];
+                                            stat.num_dispatches++;
+                                            stat.total_time += completion_time;
+                                            stat.total_work_size += label.work_size;
+                                          }
+                                        }
+                                      }];
+      }
+    }
+
     uint64_t shared_event_id = this->shared_event_id++;
 
     if (@available(macos 10.14, *)) {
@@ -373,6 +567,22 @@ bool MetalDeviceQueue::synchronize()
       dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
     }
 
+    if (is_capturing) {
+      [mtlCaptureScope endScope];
+      is_capturing = false;
+      printf("[mtlCaptureScope endScope]\n");
+
+      if (is_capturing_to_disk) {
+        if (@available(macos 10.15, *)) {
+          [[MTLCaptureManager sharedCaptureManager] stopCapture];
+          has_captured_to_disk = true;
+          is_capturing_to_disk = false;
+          is_capturing = false;
+          printf("Capture stopped\n");
+        }
+      }
+    }
+
     [mtlCommandBuffer release];
 
     for (const CopyBack &mmem : copy_back_mem) {
@@ -384,6 +594,7 @@ bool MetalDeviceQueue::synchronize()
     metal_device->flush_delayed_free_list();
 
     mtlCommandBuffer = nil;
+    command_encoder_labels.clear();
   }
 
   return !(metal_device->have_error());
@@ -529,6 +740,13 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
 {
   bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
 
+  if (timing_shared_event) {
+    /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
+    if (mtlComputeEncoder) {
+      close_compute_encoder();
+    }
+  }
+
   if (@available(macos 10.14, *)) {
     if (mtlComputeEncoder) {
       if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent :
@@ -574,6 +792,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
   if (!mtlCommandBuffer) {
     mtlCommandBuffer = [mtlCommandQueue commandBuffer];
     [mtlCommandBuffer retain];
+    command_buffer_start_timing_id = timing_shared_event_id;
   }
 
   mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
@@ -584,6 +803,10 @@ void MetalDeviceQueue::close_compute_encoder()
 {
   [mtlComputeEncoder endEncoding];
   mtlComputeEncoder = nil;
+
+  if (timing_shared_event) {
+    [mtlCommandBuffer encodeSignalEvent:timing_shared_event value:timing_shared_event_id++];
+  }
 }
 
 void MetalDeviceQueue::close_blit_encoder()
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index cc653ab7e12..f728967835d 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -14,6 +14,8 @@
 
 #  include "util/thread.h"
 
+#  define metal_printf VLOG(4) << string_printf
+
 CCL_NAMESPACE_BEGIN
 
 enum MetalGPUVendor {
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 9fc265bc327..9ab9bbb59c5 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -452,9 +452,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
 
   { /* Load and compile PTX module with OptiX kernels. */
-    string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
-                                                 "lib/kernel_optix_shader_raytrace.ptx" :
-                                                 "lib/kernel_optix.ptx");
+    string ptx_data, ptx_filename = path_get(
+                         (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
+                             "lib/kernel_optix_shader_raytrace.ptx" :
+                             "lib/kernel_optix.ptx");
     if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
       if (!getenv("OPTIX_ROOT_DIR")) {
         set_error(
@@ -464,7 +465,9 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       }
       ptx_filename = compile_kernel(
           kernel_features,
-          (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+          (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
+              "kernel_shader_raytrace" :
+              "kernel",
           "optix",
           true);
     }
@@ -550,7 +553,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       OptixBuiltinISOptions builtin_options = {};
 #  if OPTIX_ABI_VERSION >= 55
       builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
-      builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+      builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE |
+                                   OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
       builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */
 #  else
       builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
@@ -620,6 +624,14 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
         "__direct_callable__svm_node_bevel";
   }
 
+  /* MNEE. */
+  if (kernel_features & KERNEL_FEATURE_MNEE) {
+    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_surface_mnee";
+  }
+
   optix_assert(optixProgramGroupCreate(
       context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
 
@@ -701,6 +713,46 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
         pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
   }
 
+  if (kernel_features & KERNEL_FEATURE_MNEE) {
+    /* Create MNEE pipeline. */
+    vector<OptixProgramGroup> pipeline_groups;
+    pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_MISS]);
+    pipeline_groups.push_back(groups[PG_HITD]);
+    pipeline_groups.push_back(groups[PG_HITS]);
+    pipeline_groups.push_back(groups[PG_HITL]);
+    pipeline_groups.push_back(groups[PG_HITV]);
+    if (motion_blur) {
+      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+    }
+    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
+      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    }
+    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+    optix_assert(optixPipelineCreate(context,
+                                     &pipeline_options,
+                                     &link_options,
+                                     pipeline_groups.data(),
+                                     pipeline_groups.size(),
+                                     nullptr,
+                                     0,
+                                     &pipelines[PIP_SHADE_MNEE]));
+
+    /* Combine ray generation and trace continuation stack size. */
+    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG +
+                             link_options.maxTraceDepth * trace_css;
+    const unsigned int dss = 0;
+
+    /* Set stack size depending on pipeline options. */
+    optix_assert(
+        optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2));
+  }
+
   { /* Create intersection-only pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
@@ -1186,7 +1238,7 @@ bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
   const OptixResult result = optixDenoiserSetup(
       denoiser_.optix_denoiser,
       0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
-            on a stream that is not the default stream */
+          * on a stream that is not the default stream. */
       tile_size.x + denoiser_.sizes.overlapWindowSizeInPixels * 2,
       tile_size.y + denoiser_.sizes.overlapWindowSizeInPixels * 2,
       denoiser_.state.device_pointer,
@@ -1336,7 +1388,10 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
   OptixAccelBufferSizes sizes = {};
   OptixAccelBuildOptions options = {};
   options.operation = operation;
-  if (use_fast_trace_bvh) {
+  if (use_fast_trace_bvh ||
+      /* The build flags have to match the ones used to query the built-in curve intersection
+         program (see optixBuiltinISModuleGet above) */
+      build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) {
     VLOG(2) << "Using fast to trace OptiX BVH";
     options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
   }
@@ -1861,7 +1916,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       {
         /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
          * since it needs to filter out end-caps there).
-
+         *
          * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
          * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
          */
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 1f53c729c3f..817afdc8384 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -24,6 +24,7 @@ enum {
   PG_RGEN_INTERSECT_SUBSURFACE,
   PG_RGEN_INTERSECT_VOLUME_STACK,
   PG_RGEN_SHADE_SURFACE_RAYTRACE,
+  PG_RGEN_SHADE_SURFACE_MNEE,
   PG_MISS,
   PG_HITD, /* Default hit group. */
   PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -46,7 +47,7 @@ static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;
 
 /* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
 
 /* A single shader binding table entry. */
 struct SbtRecord {
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index d635512c58a..366bf95269d 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -28,6 +28,7 @@ void OptiXDeviceQueue::init_execution()
 static bool is_optix_specific_kernel(DeviceKernel kernel)
 {
   return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
           kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
           kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
           kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
@@ -63,7 +64,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                         cuda_stream_));
 
   if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
     cuda_device_assert(
         cuda_device_,
         cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -82,6 +84,10 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
       pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
       break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+      pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
+      break;
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
       pipeline = optix_device->pipelines[PIP_INTERSECT];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h
index b3a55efb327..6c8bfed7ec8 100644
--- a/intern/cycles/graph/node_enum.h
+++ b/intern/cycles/graph/node_enum.h
@@ -19,8 +19,10 @@ struct NodeEnum {
   }
   void insert(const char *x, int y)
   {
-    left[ustring(x)] = y;
-    right[y] = ustring(x);
+    ustring ustr_x(x);
+
+    left[ustr_x] = y;
+    right[y] = ustr_x;
   }
 
   bool exists(ustring x) const
diff --git a/intern/cycles/hydra/display_driver.cpp b/intern/cycles/hydra/display_driver.cpp
index a809ace63e2..0c0b577c358 100644
--- a/intern/cycles/hydra/display_driver.cpp
+++ b/intern/cycles/hydra/display_driver.cpp
@@ -23,10 +23,18 @@ HdCyclesDisplayDriver::HdCyclesDisplayDriver(HdCyclesSession *renderParam, Hgi *
 
 HdCyclesDisplayDriver::~HdCyclesDisplayDriver()
 {
-  deinit();
+  if (texture_) {
+    _hgi->DestroyTexture(&texture_);
+  }
+
+  if (gl_pbo_id_) {
+    glDeleteBuffers(1, &gl_pbo_id_);
+  }
+
+  gl_context_dispose();
 }
 
-void HdCyclesDisplayDriver::init()
+void HdCyclesDisplayDriver::gl_context_create()
 {
 #ifdef _WIN32
   if (!gl_context_) {
@@ -64,16 +72,42 @@ void HdCyclesDisplayDriver::init()
   }
 }
 
-void HdCyclesDisplayDriver::deinit()
+bool HdCyclesDisplayDriver::gl_context_enable()
 {
-  if (texture_) {
-    _hgi->DestroyTexture(&texture_);
+#ifdef _WIN32
+  if (!hdc_ || !gl_context_) {
+    return false;
   }
 
-  if (gl_pbo_id_) {
-    glDeleteBuffers(1, &gl_pbo_id_);
+  mutex_.lock();
+
+  // Do not change context if this is called in the main thread
+  if (wglGetCurrentContext() == nullptr) {
+    if (!TF_VERIFY(wglMakeCurrent((HDC)hdc_, (HGLRC)gl_context_))) {
+      mutex_.unlock();
+      return false;
+    }
+  }
+
+  return true;
+#else
+  return false;
+#endif
+}
+
+void HdCyclesDisplayDriver::gl_context_disable()
+{
+#ifdef _WIN32
+  if (wglGetCurrentContext() == gl_context_) {
+    TF_VERIFY(wglMakeCurrent(nullptr, nullptr));
   }
 
+  mutex_.unlock();
+#endif
+}
+
+void HdCyclesDisplayDriver::gl_context_dispose()
+{
 #ifdef _WIN32
   if (gl_context_) {
     TF_VERIFY(wglDeleteContext((HGLRC)gl_context_));
@@ -90,13 +124,9 @@ bool HdCyclesDisplayDriver::update_begin(const Params &params,
                                          int texture_width,
                                          int texture_height)
 {
-#ifdef _WIN32
-  if (!hdc_ || !gl_context_) {
+  if (!gl_context_enable()) {
     return false;
   }
-#endif
-
-  graphics_interop_activate();
 
   if (gl_render_sync_) {
     glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
@@ -121,15 +151,14 @@ bool HdCyclesDisplayDriver::update_begin(const Params &params,
 void HdCyclesDisplayDriver::update_end()
 {
   gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-
   glFlush();
 
-  graphics_interop_deactivate();
+  gl_context_disable();
 }
 
 void HdCyclesDisplayDriver::flush()
 {
-  graphics_interop_activate();
+  gl_context_enable();
 
   if (gl_upload_sync_) {
     glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
@@ -139,7 +168,7 @@ void HdCyclesDisplayDriver::flush()
     glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
   }
 
-  graphics_interop_deactivate();
+  gl_context_disable();
 }
 
 half4 *HdCyclesDisplayDriver::map_texture_buffer()
@@ -179,25 +208,12 @@ DisplayDriver::GraphicsInterop HdCyclesDisplayDriver::graphics_interop_get()
 
 void HdCyclesDisplayDriver::graphics_interop_activate()
 {
-  mutex_.lock();
-
-#ifdef _WIN32
-  // Do not change context if this is called in the main thread
-  if (wglGetCurrentContext() == nullptr) {
-    TF_VERIFY(wglMakeCurrent((HDC)hdc_, (HGLRC)gl_context_));
-  }
-#endif
+  gl_context_enable();
 }
 
 void HdCyclesDisplayDriver::graphics_interop_deactivate()
 {
-#ifdef _WIN32
-  if (wglGetCurrentContext() == gl_context_) {
-    TF_VERIFY(wglMakeCurrent(nullptr, nullptr));
-  }
-#endif
-
-  mutex_.unlock();
+  gl_context_disable();
 }
 
 void HdCyclesDisplayDriver::clear()
@@ -214,7 +230,11 @@ void HdCyclesDisplayDriver::draw(const Params &params)
     return;
   }
 
-  init();
+  if (!renderBuffer->IsResourceUsed()) {
+    return;
+  }
+
+  gl_context_create();
 
   // Cycles 'DisplayDriver' only supports 'half4' format
   TF_VERIFY(renderBuffer->GetFormat() == HdFormatFloat16Vec4);
@@ -255,7 +275,6 @@ void HdCyclesDisplayDriver::draw(const Params &params)
   glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
   gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-
   glFlush();
 
   need_update_ = false;
diff --git a/intern/cycles/hydra/display_driver.h b/intern/cycles/hydra/display_driver.h
index 20086830e6a..2a05397c325 100644
--- a/intern/cycles/hydra/display_driver.h
+++ b/intern/cycles/hydra/display_driver.h
@@ -19,9 +19,6 @@ class HdCyclesDisplayDriver final : public CCL_NS::DisplayDriver {
   ~HdCyclesDisplayDriver();
 
  private:
-  void init();
-  void deinit();
-
   void next_tile_begin() override;
 
   bool update_begin(const Params &params, int texture_width, int texture_height) override;
@@ -41,6 +38,11 @@ class HdCyclesDisplayDriver final : public CCL_NS::DisplayDriver {
 
   void draw(const Params &params) override;
 
+  void gl_context_create();
+  bool gl_context_enable();
+  void gl_context_disable();
+  void gl_context_dispose();
+
   HdCyclesSession *const _renderParam;
   Hgi *const _hgi;
 
@@ -48,7 +50,6 @@ class HdCyclesDisplayDriver final : public CCL_NS::DisplayDriver {
   void *hdc_ = nullptr;
   void *gl_context_ = nullptr;
 #endif
-
   CCL_NS::thread_mutex mutex_;
 
   PXR_NS::HgiTextureHandle texture_;
diff --git a/intern/cycles/hydra/output_driver.cpp b/intern/cycles/hydra/output_driver.cpp
index c5f64ac1c18..f4ea853f243 100644
--- a/intern/cycles/hydra/output_driver.cpp
+++ b/intern/cycles/hydra/output_driver.cpp
@@ -30,11 +30,11 @@ bool HdCyclesOutputDriver::update_render_tile(const Tile &tile)
   std::vector<float> pixels;
 
   for (const HdRenderPassAovBinding &aovBinding : _renderParam->GetAovBindings()) {
-    if (aovBinding == _renderParam->GetDisplayAovBinding()) {
-      continue;  // Display AOV binding is already updated by Cycles display driver
-    }
-
     if (const auto renderBuffer = static_cast<HdCyclesRenderBuffer *>(aovBinding.renderBuffer)) {
+      if (aovBinding == _renderParam->GetDisplayAovBinding() && renderBuffer->IsResourceUsed()) {
+        continue;  // Display AOV binding is already updated by Cycles display driver
+      }
+
       const HdFormat format = renderBuffer->GetFormat();
       if (format == HdFormatInvalid) {
         continue;  // Skip invalid AOV bindings
diff --git a/intern/cycles/hydra/render_buffer.cpp b/intern/cycles/hydra/render_buffer.cpp
index 4867def0624..4d8b21d1e61 100644
--- a/intern/cycles/hydra/render_buffer.cpp
+++ b/intern/cycles/hydra/render_buffer.cpp
@@ -35,7 +35,7 @@ bool HdCyclesRenderBuffer::Allocate(const GfVec3i &dimensions, HdFormat format,
     return false;
   }
 
-  const size_t oldSize = _data.size();
+  const size_t oldSize = _dataSize;
   const size_t newSize = dimensions[0] * dimensions[1] * HdDataSizeOfFormat(format);
   if (oldSize == newSize) {
     return true;
@@ -49,8 +49,8 @@ bool HdCyclesRenderBuffer::Allocate(const GfVec3i &dimensions, HdFormat format,
   _width = dimensions[0];
   _height = dimensions[1];
   _format = format;
-
-  _data.resize(newSize);
+  _dataSize = newSize;
+  _resourceUsed = false;
 
   return true;
 }
@@ -63,6 +63,7 @@ void HdCyclesRenderBuffer::_Deallocate()
 
   _data.clear();
   _data.shrink_to_fit();
+  _dataSize = 0;
 
   _resource = VtValue();
 }
@@ -74,6 +75,10 @@ void *HdCyclesRenderBuffer::Map()
     return nullptr;
   }
 
+  if (_data.size() != _dataSize) {
+    _data.resize(_dataSize);
+  }
+
   ++_mapped;
 
   return _data.data();
@@ -103,10 +108,17 @@ void HdCyclesRenderBuffer::SetConverged(bool converged)
   _converged = converged;
 }
 
+bool HdCyclesRenderBuffer::IsResourceUsed() const
+{
+  return _resourceUsed;
+}
+
 VtValue HdCyclesRenderBuffer::GetResource(bool multiSampled) const
 {
   TF_UNUSED(multiSampled);
 
+  _resourceUsed = true;
+
   return _resource;
 }
 
diff --git a/intern/cycles/hydra/render_buffer.h b/intern/cycles/hydra/render_buffer.h
index 8eb874f0068..90629d4aee0 100644
--- a/intern/cycles/hydra/render_buffer.h
+++ b/intern/cycles/hydra/render_buffer.h
@@ -58,6 +58,8 @@ class HdCyclesRenderBuffer final : public PXR_NS::HdRenderBuffer {
 
   void SetConverged(bool converged);
 
+  bool IsResourceUsed() const;
+
   PXR_NS::VtValue GetResource(bool multiSampled = false) const override;
 
   void SetResource(const PXR_NS::VtValue &resource);
@@ -74,9 +76,11 @@ class HdCyclesRenderBuffer final : public PXR_NS::HdRenderBuffer {
   unsigned int _width = 0u;
   unsigned int _height = 0u;
   PXR_NS::HdFormat _format = PXR_NS::HdFormatInvalid;
+  size_t _dataSize = 0;
 
   std::vector<uint8_t> _data;
   PXR_NS::VtValue _resource;
+  mutable std::atomic_bool _resourceUsed = false;
 
   std::atomic_int _mapped = 0;
   std::atomic_bool _converged = false;
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 8306460d607..ede81705ae8 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -65,6 +65,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
           device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_shader_mnee_sort_counter_(
+          device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
       integrator_shader_sort_prefix_sum_(
           device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
       integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
@@ -188,6 +190,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
     integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
     integrator_shader_raytrace_sort_counter_.zero_to_device();
 
+    integrator_shader_mnee_sort_counter_.alloc(max_shaders);
+    integrator_shader_mnee_sort_counter_.zero_to_device();
+
     integrator_shader_sort_prefix_sum_.alloc(max_shaders);
     integrator_shader_sort_prefix_sum_.zero_to_device();
 
@@ -195,6 +200,8 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
         (int *)integrator_shader_sort_counter_.device_pointer;
     integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
         (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+        (int *)integrator_shader_mnee_sort_counter_.device_pointer;
   }
 }
 
@@ -327,6 +334,7 @@ void PathTraceWorkGPU::enqueue_reset()
   queue_->zero_to_device(integrator_queue_counter_);
   queue_->zero_to_device(integrator_shader_sort_counter_);
   queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+  queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
 
   /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
    * counter on the host side because `zero_to_device()` is not doing it. */
@@ -450,6 +458,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
       /* Shading kernels with integrator state and render buffer. */
       DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size);
@@ -1080,13 +1089,15 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
 bool PathTraceWorkGPU::kernel_uses_sorting(DeviceKernel kernel)
 {
   return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
 }
 
 bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
 {
   return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
           kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
           kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
 }
 
@@ -1094,7 +1105,8 @@ bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
 {
   return (device_scene_->data.kernel_features & KERNEL_FEATURE_AO) &&
          (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
 }
 
 bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 90f8b8a4509..4c10a221a30 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -133,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Shader sorting. */
   device_vector<int> integrator_shader_sort_counter_;
   device_vector<int> integrator_shader_raytrace_sort_counter_;
+  device_vector<int> integrator_shader_mnee_sort_counter_;
   device_vector<int> integrator_shader_sort_prefix_sum_;
   /* Path split. */
   device_vector<int> integrator_next_main_path_index_;
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
index 3b714a3e580..7809ec5f4a7 100644
--- a/intern/cycles/kernel/device/cpu/image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -817,6 +817,16 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
     }
     case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
       return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp);
+    case IMAGE_DATA_TYPE_NANOVDB_FPN: {
+      const float f = NanoVDBInterpolator<nanovdb::FpN, float>::interp_3d(
+          info, P.x, P.y, P.z, interp);
+      return make_float4(f, f, f, 1.0f);
+    }
+    case IMAGE_DATA_TYPE_NANOVDB_FP16: {
+      const float f = NanoVDBInterpolator<nanovdb::Fp16, float>::interp_3d(
+          info, P.x, P.y, P.z, interp);
+      return make_float4(f, f, f, 1.0f);
+    }
 #endif
     default:
       assert(0);
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index c5bc7d88e02..29d851ae478 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -125,7 +125,8 @@ kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, fl
 
 #ifdef WITH_NANOVDB
 template<typename T, typename S>
-ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z)
+ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb(
+    S &s, float x, float y, float z)
 {
   float px = floorf(x);
   float py = floorf(y);
@@ -157,7 +158,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
 }
 
 template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_nanovdb(
+ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
     ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
 {
   using namespace nanovdb;
@@ -238,6 +239,14 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
         info, x, y, z, interpolation);
     return make_float4(f[0], f[1], f[2], 1.0f);
   }
+  if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FPN) {
+    float f = kernel_tex_image_interp_nanovdb<nanovdb::FpN>(info, x, y, z, interpolation);
+    return make_float4(f, f, f, 1.0f);
+  }
+  if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
+    float f = kernel_tex_image_interp_nanovdb<nanovdb::Fp16>(info, x, y, z, interpolation);
+    return make_float4(f, f, f, 1.0f);
+  }
 #endif
   if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
       texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 39f0d3c9cd0..b9a44ccad02 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -245,7 +245,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 ccl_gpu_kernel_postfix
 
-#ifdef __KERNEL_METAL__
+#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
 constant int __dummy_constant [[function_constant(0)]];
 #endif
 
@@ -260,7 +260,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
 
-#ifdef __KERNEL_METAL__
+#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
     KernelGlobals kg = NULL;
     /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
      * Dummy offset should not affect result, but somehow fixes bug! */
@@ -274,6 +274,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 ccl_gpu_kernel_postfix
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+    ccl_gpu_kernel_signature(integrator_shade_surface_mnee,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
+{
+  const int global_index = ccl_gpu_global_id_x();
+
+  if (global_index < work_size) {
+    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+    ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer));
+  }
+}
+ccl_gpu_kernel_postfix
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     ccl_gpu_kernel_signature(integrator_shade_volume,
                              ccl_global const int *path_index_array,
                              ccl_global float *render_buffer,
@@ -651,8 +666,9 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
     const int x = render_pixel_index % width; \
     const int y = render_pixel_index / width; \
 \
-    ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
-                                     y * stride * kfilm_convert.pass_stride; \
+    const uint64_t buffer_pixel_index = x + y * stride; \
+    ccl_global const float *buffer = render_buffer + offset + \
+                                     buffer_pixel_index * kfilm_convert.pass_stride; \
 \
     ccl_global float *pixel = pixels + \
                               (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
@@ -681,8 +697,9 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
     const int x = render_pixel_index % width; \
     const int y = render_pixel_index / width; \
 \
-    ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
-                                     y * stride * kfilm_convert.pass_stride; \
+    const uint64_t buffer_pixel_index = x + y * stride; \
+    ccl_global const float *buffer = render_buffer + offset + \
+                                     buffer_pixel_index * kfilm_convert.pass_stride; \
 \
     float pixel[4]; \
     film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
index e2c5d2ff024..3bd57bc0f1a 100644
--- a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -15,3 +15,11 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytr
                                                        global_index;
   integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
 }
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_mnee()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+                                                       global_index;
+  integrator_shade_surface_mnee(nullptr, path_index, __params.render_buffer);
+}
diff --git a/intern/cycles/kernel/film/adaptive_sampling.h b/intern/cycles/kernel/film/adaptive_sampling.h
index ad9b3b08ac5..16867c39d99 100644
--- a/intern/cycles/kernel/film/adaptive_sampling.h
+++ b/intern/cycles/kernel/film/adaptive_sampling.h
@@ -91,13 +91,13 @@ ccl_device void kernel_adaptive_sampling_filter_x(KernelGlobals kg,
   bool prev = false;
   for (int x = start_x; x < start_x + width; ++x) {
     int index = offset + x + y * stride;
-    ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+    ccl_global float *buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride;
     const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
 
     if (buffer[aux_w_offset] == 0.0f) {
       if (x > start_x && !prev) {
         index = index - 1;
-        buffer = render_buffer + index * kernel_data.film.pass_stride;
+        buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride;
         buffer[aux_w_offset] = 0.0f;
       }
       prev = true;
@@ -124,13 +124,13 @@ ccl_device void kernel_adaptive_sampling_filter_y(KernelGlobals kg,
   bool prev = false;
   for (int y = start_y; y < start_y + height; ++y) {
     int index = offset + x + y * stride;
-    ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+    ccl_global float *buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride;
     const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
 
     if (buffer[aux_w_offset] == 0.0f) {
       if (y > start_y && !prev) {
         index = index - stride;
-        buffer = render_buffer + index * kernel_data.film.pass_stride;
+        buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride;
         buffer[aux_w_offset] = 0.0f;
       }
       prev = true;
diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index 3772db845a8..0db4241b6e3 100644
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -102,7 +102,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   /* Setup render buffers. */
   const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
   const int pass_stride = kernel_data.film.pass_stride;
-  ccl_global float *buffer = render_buffer + index * pass_stride;
+  ccl_global float *buffer = render_buffer + (uint64_t)index * pass_stride;
 
   ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
   ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
@@ -243,9 +243,12 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     /* Setup next kernel to execute. */
     const bool use_caustics = kernel_data.integrator.use_caustics &&
                               (object_flag & SD_OBJECT_CAUSTICS);
-    const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE) || use_caustics;
+    const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
 
-    if (use_raytrace_kernel) {
+    if (use_caustics) {
+      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index);
+    }
+    else if (use_raytrace_kernel) {
       INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
     }
     else {
diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h
index b8ce625c11b..2dfac44b414 100644
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -125,9 +125,12 @@ ccl_device_forceinline void integrator_split_shadow_catcher(
   const int flags = kernel_tex_fetch(__shaders, shader).flags;
   const bool use_caustics = kernel_data.integrator.use_caustics &&
                             (object_flags & SD_OBJECT_CAUSTICS);
-  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics;
+  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
-  if (use_raytrace_kernel) {
+  if (use_caustics) {
+    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+  }
+  else if (use_raytrace_kernel) {
     INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
   }
   else {
@@ -150,9 +153,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
   const int object_flags = intersection_get_object_flags(kg, &isect);
   const bool use_caustics = kernel_data.integrator.use_caustics &&
                             (object_flags & SD_OBJECT_CAUSTICS);
-  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics;
+  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
-  if (use_raytrace_kernel) {
+  if (use_caustics) {
+    INTEGRATOR_PATH_NEXT_SORTED(
+        current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+  }
+  else if (use_raytrace_kernel) {
     INTEGRATOR_PATH_NEXT_SORTED(
         current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
   }
@@ -222,8 +229,12 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
         const int object_flags = intersection_get_object_flags(kg, isect);
         const bool use_caustics = kernel_data.integrator.use_caustics &&
                                   (object_flags & SD_OBJECT_CAUSTICS);
-        const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics;
-        if (use_raytrace_kernel) {
+        const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
+        if (use_caustics) {
+          INTEGRATOR_PATH_NEXT_SORTED(
+              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+        }
+        else if (use_raytrace_kernel) {
           INTEGRATOR_PATH_NEXT_SORTED(
               current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
         }
@@ -272,9 +283,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
       const int object_flags = intersection_get_object_flags(kg, isect);
       const bool use_caustics = kernel_data.integrator.use_caustics &&
                                 (object_flags & SD_OBJECT_CAUSTICS);
-      const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics;
+      const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
-      if (use_raytrace_kernel) {
+      if (use_caustics) {
+        INTEGRATOR_PATH_NEXT_SORTED(
+            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+      }
+      else if (use_raytrace_kernel) {
         INTEGRATOR_PATH_NEXT_SORTED(
             current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
       }
diff --git a/intern/cycles/kernel/integrator/megakernel.h b/intern/cycles/kernel/integrator/megakernel.h
index a0c15794470..17ae13ad23f 100644
--- a/intern/cycles/kernel/integrator/megakernel.h
+++ b/intern/cycles/kernel/integrator/megakernel.h
@@ -77,6 +77,9 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
         case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
           integrator_shade_surface_raytrace(kg, state, render_buffer);
           break;
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+          integrator_shade_surface_mnee(kg, state, render_buffer);
+          break;
         case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
           integrator_shade_light(kg, state, render_buffer);
           break;
diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h
index 2f7b711e28c..ad83f82d091 100644
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -807,6 +807,15 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
    * and keep pdf in vertex area measure */
   mnee_update_light_sample(kg, vertices[vertex_count - 1].p, ls);
 
+  /* Save state path bounce info in case a light path node is used in the refractive interface or
+   * light shader graph. */
+  const int transmission_bounce = INTEGRATOR_STATE(state, path, transmission_bounce);
+  const int diffuse_bounce = INTEGRATOR_STATE(state, path, diffuse_bounce);
+  const int bounce = INTEGRATOR_STATE(state, path, bounce);
+
+  /* Set diffuse bounce info . */
+  INTEGRATOR_STATE_WRITE(state, path, diffuse_bounce) = diffuse_bounce + 1;
+
   /* Evaluate light sample
    * in case the light has a node-based shader:
    * 1. sd_mnee will be used to store light data, which is why we need to do
@@ -814,6 +823,12 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
    *    interface data at the end of the call for the shadow ray setup to work.
    * 2. ls needs to contain the last interface data for the light shader to
    *    evaluate properly */
+
+  /* Set bounce info in case a light path node is used in the light shader graph. */
+  INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce + vertex_count -
+                                                             1;
+  INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + vertex_count;
+
   float3 light_eval = light_sample_shader_eval(kg, state, sd_mnee, ls, sd->time);
   bsdf_eval_mul3(throughput, light_eval / ls->pdf);
 
@@ -885,6 +900,11 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
                              false,
                              LAMP_NONE);
 
+    /* Set bounce info in case a light path node is used in the refractive interface
+     * shader graph. */
+    INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce + vi;
+    INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + 1 + vi;
+
     /* Evaluate shader nodes at solution vi. */
     shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
         kg, state, sd_mnee, NULL, PATH_RAY_DIFFUSE, true);
@@ -901,6 +921,11 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
     bsdf_eval_mul3(throughput, bsdf_contribution);
   }
 
+  /* Restore original state path bounce info. */
+  INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce;
+  INTEGRATOR_STATE_WRITE(state, path, diffuse_bounce) = diffuse_bounce;
+  INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce;
+
   return true;
 }
 
@@ -1029,18 +1054,17 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
     return 0;
 
   /* Check whether the transmission depth limit is reached before continuing. */
-  if (INTEGRATOR_STATE(state, path, transmission_bounce) + vertex_count >=
+  if ((INTEGRATOR_STATE(state, path, transmission_bounce) + vertex_count - 1) >=
       kernel_data.integrator.max_transmission_bounce)
     return 0;
 
   /* Check whether the diffuse depth limit is reached before continuing. */
-  if (INTEGRATOR_STATE(state, path, diffuse_bounce) + 1 >=
+  if ((INTEGRATOR_STATE(state, path, diffuse_bounce) + 1) >=
       kernel_data.integrator.max_diffuse_bounce)
     return 0;
 
   /* Check whether the overall depth limit is reached before continuing. */
-  if (INTEGRATOR_STATE(state, path, bounce) + 1 + vertex_count >=
-      kernel_data.integrator.max_bounce)
+  if ((INTEGRATOR_STATE(state, path, bounce) + vertex_count) >= kernel_data.integrator.max_bounce)
     return 0;
 
   /* Mark the manifold walk valid to turn off mollification regardless of how successful the walk
diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index 62b3ce1c15c..72ecf67e8a0 100644
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -48,7 +48,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
 
     PROFILING_SHADER(emission_sd->object, emission_sd->shader);
     PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND>(
         kg, state, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
 
     L = shader_background_eval(emission_sd);
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 859c314b088..ce1398859b7 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -137,7 +137,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 
 #  ifdef __MNEE__
   int mnee_vertex_count = 0;
-  IF_KERNEL_NODES_FEATURE(RAYTRACE)
+  IF_KERNEL_FEATURE(MNEE)
   {
     if (ls.lamp != LAMP_NONE) {
       /* Is this a caustic light? */
@@ -253,13 +253,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 #  ifdef __MNEE__
   if (mnee_vertex_count > 0) {
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) =
-        INTEGRATOR_STATE(state, path, transmission_bounce) + mnee_vertex_count;
+        INTEGRATOR_STATE(state, path, transmission_bounce) + mnee_vertex_count - 1;
     INTEGRATOR_STATE_WRITE(shadow_state,
                            shadow_path,
                            diffuse_bounce) = INTEGRATOR_STATE(state, path, diffuse_bounce) + 1;
     INTEGRATOR_STATE_WRITE(shadow_state,
                            shadow_path,
-                           bounce) = INTEGRATOR_STATE(state, path, bounce) + 1 + mnee_vertex_count;
+                           bounce) = INTEGRATOR_STATE(state, path, bounce) + mnee_vertex_count;
   }
   else
 #  endif
@@ -631,4 +631,12 @@ ccl_device_forceinline void integrator_shade_surface_raytrace(
       kg, state, render_buffer);
 }
 
+ccl_device_forceinline void integrator_shade_surface_mnee(
+    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
+{
+  integrator_shade_surface<(KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE) |
+                               KERNEL_FEATURE_MNEE,
+                           DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE>(kg, state, render_buffer);
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/subsurface.h b/intern/cycles/kernel/integrator/subsurface.h
index 2391cc2356d..b449f807290 100644
--- a/intern/cycles/kernel/integrator/subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -174,9 +174,14 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
   const int object_flags = intersection_get_object_flags(kg, &ss_isect.hits[0]);
   const bool use_caustics = kernel_data.integrator.use_caustics &&
                             (object_flags & SD_OBJECT_CAUSTICS);
-  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE) || use_caustics;
+  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
 
-  if (use_raytrace_kernel) {
+  if (use_caustics) {
+    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+                                DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE,
+                                shader);
+  }
+  else if (use_raytrace_kernel) {
     INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                 shader);
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index dbbce655ef7..39fcee1ac0d 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -17,6 +17,7 @@
 #include "kernel/types.h"
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/bsdf_diffuse_ramp.h"
+#include "kernel/closure/bsdf_util.h"
 // clang-format on
 
 CCL_NAMESPACE_BEGIN
@@ -30,6 +31,8 @@ class DiffuseRampClosure : public CBSDFClosure {
 
   void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     DiffuseRampBsdf *bsdf = (DiffuseRampBsdf *)bsdf_alloc_osl(
         sd, sizeof(DiffuseRampBsdf), weight, &params);
 
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index fd602c8847e..972ed7e4a6d 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -17,6 +17,7 @@
 #include "kernel/types.h"
 #include "kernel/closure/alloc.h"
 #include "kernel/closure/bsdf_phong_ramp.h"
+#include "kernel/closure/bsdf_util.h"
 // clang-format on
 
 CCL_NAMESPACE_BEGIN
@@ -30,6 +31,8 @@ class PhongRampClosure : public CBSDFClosure {
 
   void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     PhongRampBsdf *bsdf = (PhongRampBsdf *)bsdf_alloc_osl(
         sd, sizeof(PhongRampBsdf), weight, &params);
 
diff --git a/intern/cycles/kernel/osl/bssrdf.cpp b/intern/cycles/kernel/osl/bssrdf.cpp
index 29d2d93ce34..4b282fddad3 100644
--- a/intern/cycles/kernel/osl/bssrdf.cpp
+++ b/intern/cycles/kernel/osl/bssrdf.cpp
@@ -44,6 +44,8 @@ class CBSSRDFClosure : public CClosurePrimitive {
 
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     if (method == u_burley) {
       alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
     }
diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp
index 06b91ab9bb7..7c6b48154e4 100644
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -180,6 +180,8 @@ class PrincipledSheenClosure : public CBSDFClosure {
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
     if (!skip(sd, path_flag, LABEL_DIFFUSE)) {
+      params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
       PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)bsdf_alloc_osl(
           sd, sizeof(PrincipledSheenBsdf), weight, &params);
       sd->flag |= (bsdf) ? bsdf_principled_sheen_setup(sd, bsdf) : 0;
@@ -223,6 +225,8 @@ class PrincipledHairClosure : public CBSDFClosure {
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
     if (!skip(sd, path_flag, LABEL_GLOSSY)) {
+      params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
       PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)alloc(sd, path_flag, weight);
       if (!bsdf) {
         return;
@@ -282,6 +286,7 @@ class PrincipledClearcoatClosure : public CBSDFClosure {
 
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -503,6 +508,8 @@ class MicrofacetClosure : public CBSDFClosure {
       return;
     }
 
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
         sd, sizeof(MicrofacetBsdf), weight, &params);
 
@@ -601,6 +608,8 @@ class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure {
  public:
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -630,6 +639,8 @@ class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
  public:
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -695,6 +706,8 @@ class MicrofacetMultiGGXClosure : public MicrofacetMultiClosure {
  public:
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -723,6 +736,8 @@ class MicrofacetMultiGGXAnisoClosure : public MicrofacetMultiClosure {
  public:
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -755,6 +770,8 @@ class MicrofacetMultiGGXGlassClosure : public MicrofacetMultiClosure {
 
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -819,6 +836,8 @@ class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure {
  public:
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -849,6 +868,8 @@ class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosu
  public:
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
@@ -883,6 +904,8 @@ class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosu
 
   void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
   {
+    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+
     MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
     if (!bsdf) {
       return;
diff --git a/intern/cycles/kernel/osl/closures.h b/intern/cycles/kernel/osl/closures.h
index 6225644a8e6..e10a3d88a04 100644
--- a/intern/cycles/kernel/osl/closures.h
+++ b/intern/cycles/kernel/osl/closures.h
@@ -114,6 +114,7 @@ class CBSDFClosure : public CClosurePrimitive {
     void setup(ShaderData *sd, uint32_t path_flag, float3 weight) \
     { \
       if (!skip(sd, path_flag, TYPE)) { \
+        params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); \
         structname *bsdf = (structname *)bsdf_alloc_osl(sd, sizeof(structname), weight, &params); \
         sd->flag |= (bsdf) ? bsdf_##lower##_setup(bsdf) : 0; \
       } \
diff --git a/intern/cycles/kernel/svm/blackbody.h b/intern/cycles/kernel/svm/blackbody.h
index af59c2fe747..774fa16b384 100644
--- a/intern/cycles/kernel/svm/blackbody.h
+++ b/intern/cycles/kernel/svm/blackbody.h
@@ -24,6 +24,7 @@ ccl_device_noinline void svm_node_blackbody(KernelGlobals kg,
   float temperature = stack_load_float(stack, temperature_offset);
 
   float3 color_rgb = rec709_to_rgb(kg, svm_math_blackbody_color_rec709(temperature));
+  color_rgb = max(color_rgb, zero_float3());
 
   stack_store_float3(stack, col_offset, color_rgb);
 }
diff --git a/intern/cycles/kernel/svm/displace.h b/intern/cycles/kernel/svm/displace.h
index 56fb5c3f9b0..128023263fd 100644
--- a/intern/cycles/kernel/svm/displace.h
+++ b/intern/cycles/kernel/svm/displace.h
@@ -188,6 +188,7 @@ ccl_device_noinline int svm_node_vector_displacement(
   else
   {
     stack_store_float3(stack, displacement_offset, zero_float3());
+    (void)data_node;
   }
 
   return offset;
diff --git a/intern/cycles/kernel/svm/math_util.h b/intern/cycles/kernel/svm/math_util.h
index 9f2d9561e26..89bd4a501a7 100644
--- a/intern/cycles/kernel/svm/math_util.h
+++ b/intern/cycles/kernel/svm/math_util.h
@@ -192,28 +192,26 @@ ccl_device float svm_math(NodeMathType type, float a, float b, float c)
 ccl_device float3 svm_math_blackbody_color_rec709(float t)
 {
   /* Calculate color in range 800..12000 using an approximation
-   * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B
-   * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
-   * which is enough to get the same 8 bit/channel color.
-   */
+   * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B.
+   *
+   * The result of this can be negative to support gamut wider than
+   * than rec.709, just needs to be clamped. */
 
   if (t >= 12000.0f) {
-    return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+    return make_float3(0.8262954810464208f, 0.9945080501520986f, 1.566307710274283f);
   }
-  else if (t < 965.0f) {
-    /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
-    return make_float3(4.70366907f, 0.0f, 0.0f);
+  else if (t < 800.0f) {
+    /* Arbitrary lower limit where light is very dim, matching OSL. */
+    return make_float3(5.413294490189271f, -0.20319390035873933f, -0.0822535242887164f);
   }
 
-  /* Manually align for readability. */
-  /* clang-format off */
-  int i = (t >= 6365.0f) ? 5 :
-          (t >= 3315.0f) ? 4 :
-          (t >= 1902.0f) ? 3 :
-          (t >= 1449.0f) ? 2 :
-          (t >= 1167.0f) ? 1 :
+  int i = (t >= 6365.0f) ? 6 :
+          (t >= 3315.0f) ? 5 :
+          (t >= 1902.0f) ? 4 :
+          (t >= 1449.0f) ? 3 :
+          (t >= 1167.0f) ? 2 :
+          (t >= 965.0f)  ? 1 :
                            0;
-  /* clang-format on */
 
   ccl_constant float *r = blackbody_table_r[i];
   ccl_constant float *g = blackbody_table_g[i];
diff --git a/intern/cycles/kernel/tables.h b/intern/cycles/kernel/tables.h
index f826cc5c5ef..c1fdbba3fa7 100644
--- a/intern/cycles/kernel/tables.h
+++ b/intern/cycles/kernel/tables.h
@@ -4,30 +4,33 @@
 /* clang-format off */
 
 ccl_inline_constant float blackbody_table_r[][3] = {
-  {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f},
-  {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f},
-  {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f},
-  {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f},
-  {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f},
-  {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f}
+ {1.61919106e+03f, -2.05010916e-03f, 5.02995757e+00f},
+ {2.48845471e+03f, -1.11330907e-03f, 3.22621544e+00f},
+ {3.34143193e+03f, -4.86551192e-04f, 1.76486769e+00f},
+ {4.09461742e+03f, -1.27446582e-04f, 7.25731635e-01f},
+ {4.67028036e+03f, 2.91258199e-05f, 1.26703442e-01f},
+ {4.59509185e+03f, 2.87495649e-05f, 1.50345020e-01f},
+ {3.78717450e+03f, 9.35907826e-06f, 3.99075871e-01f}
 };
 
 ccl_inline_constant float blackbody_table_g[][3] = {
-  {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f},
-  {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f},
-  {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f},
-  {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f},
-  {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f},
-  {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f}
+ {-4.88999748e+02f, 6.04330754e-04f, -7.55807526e-02f},
+ {-7.55994277e+02f, 3.16730098e-04f, 4.78306139e-01f},
+ {-1.02363977e+03f, 1.20223470e-04f, 9.36662319e-01f},
+ {-1.26571316e+03f, 4.87340896e-06f, 1.27054498e+00f},
+ {-1.42529332e+03f, -4.01150431e-05f, 1.43972784e+00f},
+ {-1.17554822e+03f, -2.16378048e-05f, 1.30408023e+00f},
+ {-5.00799571e+02f, -4.59832026e-06f, 1.09098763e+00f}
 };
 
 ccl_inline_constant float blackbody_table_b[][4] = {
-  {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */
-  {0.0f, 0.0f, 0.0f, 0.0f},
-  {0.0f, 0.0f, 0.0f, 0.0f},
-  {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f},
-  {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f},
-  {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f}
+ {5.96945309e-11f, -4.85742887e-08f, -9.70622247e-05f, -4.07936148e-03f},
+ {2.40430366e-11f, 5.55021075e-08f, -1.98503712e-04f, 2.89312858e-02f},
+ {-1.40949732e-11f, 1.89878968e-07f, -3.56632824e-04f, 9.10767778e-02f},
+ {-3.61460868e-11f, 2.84822009e-07f, -4.93211319e-04f, 1.56723440e-01f},
+ {-1.97075738e-11f, 1.75359352e-07f, -2.50542825e-04f, -2.22783266e-02f},
+ {-1.61997957e-13f, -1.64216008e-08f, 3.86216271e-04f, -7.38077418e-01f},
+ {6.72650283e-13f, -2.73078809e-08f, 4.24098264e-04f, -7.52335691e-01f}
 };
 
 ccl_inline_constant float cie_colour_match[][3] = {
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 297e88c718c..a76effdd952 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -1572,6 +1572,7 @@ typedef enum DeviceKernel : int {
   DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
   DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
   DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE,
   DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
   DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
   DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL,
@@ -1689,6 +1690,9 @@ enum KernelFeatureFlag : uint32_t {
   KERNEL_FEATURE_AO_PASS = (1U << 25U),
   KERNEL_FEATURE_AO_ADDITIVE = (1U << 26U),
   KERNEL_FEATURE_AO = (KERNEL_FEATURE_AO_PASS | KERNEL_FEATURE_AO_ADDITIVE),
+
+  /* MNEE. */
+  KERNEL_FEATURE_MNEE = (1U << 27U),
 };
 
 /* Shader node feature mask, to specialize shader evaluation for kernels. */
@@ -1696,6 +1700,8 @@ enum KernelFeatureFlag : uint32_t {
 #define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \
   (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA | \
    KERNEL_FEATURE_NODE_LIGHT_PATH)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND \
+  (KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT | KERNEL_FEATURE_NODE_AOV)
 #define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \
   (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \
    KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
@@ -1714,9 +1720,12 @@ enum KernelFeatureFlag : uint32_t {
  * are different depending on the main, shadow or null path. For GPU we don't have
  * C++17 everywhere so can't use it. */
 #ifdef __KERNEL_CPU__
+#  define IF_KERNEL_FEATURE(feature) \
+    if constexpr ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
 #  define IF_KERNEL_NODES_FEATURE(feature) \
     if constexpr ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
 #else
+#  define IF_KERNEL_FEATURE(feature) if ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
 #  define IF_KERNEL_NODES_FEATURE(feature) \
     if ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
 #endif
diff --git a/intern/cycles/scene/attribute.cpp b/intern/cycles/scene/attribute.cpp
index df01189a54b..d6b4c0240f6 100644
--- a/intern/cycles/scene/attribute.cpp
+++ b/intern/cycles/scene/attribute.cpp
@@ -661,6 +661,26 @@ Attribute *AttributeSet::find(AttributeStandard std) const
   return NULL;
 }
 
+Attribute *AttributeSet::find_matching(const Attribute &other)
+{
+  for (Attribute &attr : attributes) {
+    if (attr.name != other.name) {
+      continue;
+    }
+    if (attr.std != other.std) {
+      continue;
+    }
+    if (attr.type != other.type) {
+      continue;
+    }
+    if (attr.element != other.element) {
+      continue;
+    }
+    return &attr;
+  }
+  return nullptr;
+}
+
 void AttributeSet::remove(AttributeStandard std)
 {
   Attribute *attr = find(std);
@@ -729,32 +749,24 @@ void AttributeSet::clear(bool preserve_voxel_data)
 
 void AttributeSet::update(AttributeSet &&new_attributes)
 {
-  /* add or update old_attributes based on the new_attributes */
-  foreach (Attribute &attr, new_attributes.attributes) {
-    Attribute *nattr = add(attr.name, attr.type, attr.element);
-    nattr->std = attr.std;
-    nattr->set_data_from(std::move(attr));
-  }
-
-  /* remove any attributes not on new_attributes */
+  /* Remove any attributes not on new_attributes. */
   list<Attribute>::iterator it;
   for (it = attributes.begin(); it != attributes.end();) {
-    if (it->std != ATTR_STD_NONE) {
-      if (new_attributes.find(it->std) == nullptr) {
-        remove(it++);
-        continue;
-      }
-    }
-    else if (it->name != "") {
-      if (new_attributes.find(it->name) == nullptr) {
-        remove(it++);
-        continue;
-      }
+    const Attribute &old_attr = *it;
+    if (new_attributes.find_matching(old_attr) == nullptr) {
+      remove(it++);
+      continue;
     }
-
     it++;
   }
 
+  /* Add or update old_attributes based on the new_attributes. */
+  foreach (Attribute &attr, new_attributes.attributes) {
+    Attribute *nattr = add(attr.name, attr.type, attr.element);
+    nattr->std = attr.std;
+    nattr->set_data_from(std::move(attr));
+  }
+
   /* If all attributes were replaced, transform is no longer applied. */
   geometry->transform_applied = false;
 }
diff --git a/intern/cycles/scene/attribute.h b/intern/cycles/scene/attribute.h
index fd13b8ff6de..7f8cbf32049 100644
--- a/intern/cycles/scene/attribute.h
+++ b/intern/cycles/scene/attribute.h
@@ -194,6 +194,7 @@ class AttributeSet {
   void remove(AttributeStandard std);
 
   Attribute *find(AttributeRequest &req);
+  Attribute *find_matching(const Attribute &other);
 
   void remove(Attribute *attribute);
 
diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp
index c61ad1f1d71..1b44162351a 100644
--- a/intern/cycles/scene/image.cpp
+++ b/intern/cycles/scene/image.cpp
@@ -64,6 +64,10 @@ const char *name_from_type(ImageDataType type)
       return "nanovdb_float";
     case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
       return "nanovdb_float3";
+    case IMAGE_DATA_TYPE_NANOVDB_FPN:
+      return "nanovdb_fpn";
+    case IMAGE_DATA_TYPE_NANOVDB_FP16:
+      return "nanovdb_fp16";
     case IMAGE_DATA_NUM_TYPES:
       assert(!"System enumerator type, should never be used");
       return "";
@@ -268,17 +272,12 @@ void ImageMetaData::detect_colorspace()
     compress_as_srgb = true;
   }
   else {
-    /* Always compress non-raw 8bit images as scene linear + sRGB, as a
-     * heuristic to keep memory usage the same without too much data loss
-     * due to quantization in common cases. */
-    compress_as_srgb = (type == IMAGE_DATA_TYPE_BYTE || type == IMAGE_DATA_TYPE_BYTE4);
-
     /* If colorspace conversion needed, use half instead of short so we can
      * represent HDR values that might result from conversion. */
-    if (type == IMAGE_DATA_TYPE_USHORT) {
+    if (type == IMAGE_DATA_TYPE_BYTE || type == IMAGE_DATA_TYPE_USHORT) {
       type = IMAGE_DATA_TYPE_HALF;
     }
-    else if (type == IMAGE_DATA_TYPE_USHORT4) {
+    else if (type == IMAGE_DATA_TYPE_BYTE4 || type == IMAGE_DATA_TYPE_USHORT4) {
       type = IMAGE_DATA_TYPE_HALF4;
     }
   }
@@ -378,7 +377,9 @@ void ImageManager::load_image_metadata(Image *img)
   metadata.detect_colorspace();
 
   assert(features.has_nanovdb || (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
-                                  metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3));
+                                  metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
+                                  metadata.type != IMAGE_DATA_TYPE_NANOVDB_FPN ||
+                                  metadata.type != IMAGE_DATA_TYPE_NANOVDB_FP16));
 
   img->need_metadata = false;
 }
@@ -796,7 +797,8 @@ void ImageManager::device_load_image(Device *device, Scene *scene, int slot, Pro
     }
   }
 #ifdef WITH_NANOVDB
-  else if (type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+  else if (type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
+           type == IMAGE_DATA_TYPE_NANOVDB_FPN || type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
     thread_scoped_lock device_lock(device_mutex);
     void *pixels = img->mem->alloc(img->metadata.byte_size, 0);
 
diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp
index 3f825afbe90..09676455308 100644
--- a/intern/cycles/scene/image_oiio.cpp
+++ b/intern/cycles/scene/image_oiio.cpp
@@ -94,10 +94,11 @@ bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures & /*features*/,
 template<TypeDesc::BASETYPE FileFormat, typename StorageType>
 static void oiio_load_pixels(const ImageMetaData &metadata,
                              const unique_ptr<ImageInput> &in,
+                             const bool associate_alpha,
                              StorageType *pixels)
 {
-  const int width = metadata.width;
-  const int height = metadata.height;
+  const size_t width = metadata.width;
+  const size_t height = metadata.height;
   const int depth = metadata.depth;
   const int components = metadata.channels;
 
@@ -105,12 +106,12 @@ static void oiio_load_pixels(const ImageMetaData &metadata,
   StorageType *readpixels = pixels;
   vector<StorageType> tmppixels;
   if (components > 4) {
-    tmppixels.resize(((size_t)width) * height * components);
+    tmppixels.resize(width * height * components);
     readpixels = &tmppixels[0];
   }
 
   if (depth <= 1) {
-    size_t scanlinesize = ((size_t)width) * components * sizeof(StorageType);
+    size_t scanlinesize = width * components * sizeof(StorageType);
     in->read_image(FileFormat,
                    (uchar *)readpixels + (height - 1) * scanlinesize,
                    AutoStride,
@@ -122,7 +123,7 @@ static void oiio_load_pixels(const ImageMetaData &metadata,
   }
 
   if (components > 4) {
-    size_t dimensions = ((size_t)width) * height;
+    size_t dimensions = width * height;
     for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) {
       pixels[i * 4 + 3] = tmppixels[i * components + 3];
       pixels[i * 4 + 2] = tmppixels[i * components + 2];
@@ -137,7 +138,7 @@ static void oiio_load_pixels(const ImageMetaData &metadata,
   if (cmyk) {
     const StorageType one = util_image_cast_from_float<StorageType>(1.0f);
 
-    const size_t num_pixels = ((size_t)width) * height * depth;
+    const size_t num_pixels = width * height * depth;
     for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) {
       float c = util_image_cast_to_float(pixels[i * 4 + 0]);
       float m = util_image_cast_to_float(pixels[i * 4 + 1]);
@@ -149,6 +150,16 @@ static void oiio_load_pixels(const ImageMetaData &metadata,
       pixels[i * 4 + 3] = one;
     }
   }
+
+  if (components == 4 && associate_alpha) {
+    size_t dimensions = width * height;
+    for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) {
+      const StorageType alpha = pixels[i * 4 + 3];
+      pixels[i * 4 + 0] = util_image_multiply_native(pixels[i * 4 + 0], alpha);
+      pixels[i * 4 + 1] = util_image_multiply_native(pixels[i * 4 + 1], alpha);
+      pixels[i * 4 + 2] = util_image_multiply_native(pixels[i * 4 + 2], alpha);
+    }
+  }
 }
 
 bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata,
@@ -172,33 +183,41 @@ bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata,
   ImageSpec spec = ImageSpec();
   ImageSpec config = ImageSpec();
 
-  if (!associate_alpha) {
-    config.attribute("oiio:UnassociatedAlpha", 1);
-  }
+  /* Load without automatic OIIO alpha conversion, we do it ourselves. OIIO
+   * will associate alpha in the the 8bit buffer for PNGs, which leads to too
+   * much precision loss when we load it as half float to do a colorspace
+   * transform. */
+  config.attribute("oiio:UnassociatedAlpha", 1);
 
   if (!in->open(filepath.string(), spec, config)) {
     return false;
   }
 
+  const bool do_associate_alpha = associate_alpha &&
+                                  spec.get_int_attribute("oiio:UnassociatedAlpha", 0);
+
   switch (metadata.type) {
     case IMAGE_DATA_TYPE_BYTE:
     case IMAGE_DATA_TYPE_BYTE4:
-      oiio_load_pixels<TypeDesc::UINT8, uchar>(metadata, in, (uchar *)pixels);
+      oiio_load_pixels<TypeDesc::UINT8, uchar>(metadata, in, do_associate_alpha, (uchar *)pixels);
       break;
     case IMAGE_DATA_TYPE_USHORT:
     case IMAGE_DATA_TYPE_USHORT4:
-      oiio_load_pixels<TypeDesc::USHORT, uint16_t>(metadata, in, (uint16_t *)pixels);
+      oiio_load_pixels<TypeDesc::USHORT, uint16_t>(
+          metadata, in, do_associate_alpha, (uint16_t *)pixels);
       break;
     case IMAGE_DATA_TYPE_HALF:
     case IMAGE_DATA_TYPE_HALF4:
-      oiio_load_pixels<TypeDesc::HALF, half>(metadata, in, (half *)pixels);
+      oiio_load_pixels<TypeDesc::HALF, half>(metadata, in, do_associate_alpha, (half *)pixels);
       break;
     case IMAGE_DATA_TYPE_FLOAT:
     case IMAGE_DATA_TYPE_FLOAT4:
-      oiio_load_pixels<TypeDesc::FLOAT, float>(metadata, in, (float *)pixels);
+      oiio_load_pixels<TypeDesc::FLOAT, float>(metadata, in, do_associate_alpha, (float *)pixels);
       break;
     case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
     case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
+    case IMAGE_DATA_TYPE_NANOVDB_FPN:
+    case IMAGE_DATA_TYPE_NANOVDB_FP16:
     case IMAGE_DATA_NUM_TYPES:
       break;
   }
diff --git a/intern/cycles/scene/image_vdb.cpp b/intern/cycles/scene/image_vdb.cpp
index b6f0911fa2c..2209be60a97 100644
--- a/intern/cycles/scene/image_vdb.cpp
+++ b/intern/cycles/scene/image_vdb.cpp
@@ -44,14 +44,30 @@ struct ToDenseOp {
 #  ifdef WITH_NANOVDB
 struct ToNanoOp {
   nanovdb::GridHandle<> nanogrid;
+  int precision;
 
   template<typename GridType, typename FloatGridType, typename FloatDataType, int channels>
   bool operator()(const openvdb::GridBase::ConstPtr &grid)
   {
     if constexpr (!std::is_same_v<GridType, openvdb::MaskGrid>) {
       try {
-        nanogrid = nanovdb::openToNanoVDB(
-            FloatGridType(*openvdb::gridConstPtrCast<GridType>(grid)));
+        FloatGridType floatgrid(*openvdb::gridConstPtrCast<GridType>(grid));
+        if constexpr (std::is_same_v<FloatGridType, openvdb::FloatGrid>) {
+          if (precision == 0) {
+            nanogrid = nanovdb::openToNanoVDB<nanovdb::HostBuffer,
+                                              typename FloatGridType::TreeType,
+                                              nanovdb::FpN>(floatgrid);
+            return true;
+          }
+          else if (precision == 16) {
+            nanogrid = nanovdb::openToNanoVDB<nanovdb::HostBuffer,
+                                              typename FloatGridType::TreeType,
+                                              nanovdb::Fp16>(floatgrid);
+            return true;
+          }
+        }
+
+        nanogrid = nanovdb::openToNanoVDB(floatgrid);
       }
       catch (const std::exception &e) {
         VLOG(1) << "Error converting OpenVDB to NanoVDB grid: " << e.what();
@@ -98,10 +114,13 @@ bool VDBImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMet
 #  ifdef WITH_NANOVDB
   if (features.has_nanovdb) {
     /* NanoVDB expects no inactive leaf nodes. */
-    /*openvdb::FloatGrid &pruned_grid = *openvdb::gridPtrCast<openvdb::FloatGrid>(grid);
+#    if 0
+    openvdb::FloatGrid &pruned_grid = *openvdb::gridPtrCast<openvdb::FloatGrid>(grid);
     openvdb::tools::pruneInactive(pruned_grid.tree());
-    nanogrid = nanovdb::openToNanoVDB(pruned_grid);*/
+    nanogrid = nanovdb::openToNanoVDB(pruned_grid);
+#    endif
     ToNanoOp op;
+    op.precision = precision;
     if (!openvdb::grid_type_operation(grid, op)) {
       return false;
     }
@@ -124,7 +143,15 @@ bool VDBImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMet
   if (nanogrid) {
     metadata.byte_size = nanogrid.size();
     if (metadata.channels == 1) {
-      metadata.type = IMAGE_DATA_TYPE_NANOVDB_FLOAT;
+      if (precision == 0) {
+        metadata.type = IMAGE_DATA_TYPE_NANOVDB_FPN;
+      }
+      else if (precision == 16) {
+        metadata.type = IMAGE_DATA_TYPE_NANOVDB_FP16;
+      }
+      else {
+        metadata.type = IMAGE_DATA_TYPE_NANOVDB_FLOAT;
+      }
     }
     else {
       metadata.type = IMAGE_DATA_TYPE_NANOVDB_FLOAT3;
diff --git a/intern/cycles/scene/image_vdb.h b/intern/cycles/scene/image_vdb.h
index a5fd51915ef..ea5f6b0b3d9 100644
--- a/intern/cycles/scene/image_vdb.h
+++ b/intern/cycles/scene/image_vdb.h
@@ -51,6 +51,7 @@ class VDBImageLoader : public ImageLoader {
 #endif
 #ifdef WITH_NANOVDB
   nanovdb::GridHandle<> nanogrid;
+  int precision = 0;
 #endif
 };
 
diff --git a/intern/cycles/scene/object.cpp b/intern/cycles/scene/object.cpp
index 8015be6393b..ddd89a16640 100644
--- a/intern/cycles/scene/object.cpp
+++ b/intern/cycles/scene/object.cpp
@@ -327,9 +327,11 @@ float Object::compute_volume_step_size() const
           /* Auto detect step size. */
           float3 size = one_float3();
 #ifdef WITH_NANOVDB
-          /* Dimensions were not applied to image transform with NanOVDB (see image_vdb.cpp) */
+          /* Dimensions were not applied to image transform with NanoVDB (see image_vdb.cpp) */
           if (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
-              metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3)
+              metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
+              metadata.type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
+              metadata.type != IMAGE_DATA_TYPE_NANOVDB_FP16)
 #endif
             size /= make_float3(metadata.width, metadata.height, metadata.depth);
 
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index b35242139ea..8b5604eba72 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -550,7 +550,7 @@ void Scene::update_kernel_features()
   dscene.data.integrator.use_caustics = false;
   if (has_caustics_caster && has_caustics_receiver && has_caustics_light) {
     dscene.data.integrator.use_caustics = true;
-    kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
+    kernel_features |= KERNEL_FEATURE_MNEE;
   }
 
   if (bake_manager->get_baking()) {
@@ -597,6 +597,7 @@ static void log_kernel_features(const uint features)
           << "\n";
   VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE)
           << "\n";
+  VLOG(2) << "Use MNEE" << string_from_bool(features & KERNEL_FEATURE_MNEE) << "\n";
   VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n";
   VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n";
   VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING)
diff --git a/intern/cycles/scene/shader_graph.cpp b/intern/cycles/scene/shader_graph.cpp
index d44e12f5fab..f25d0b7c7b9 100644
--- a/intern/cycles/scene/shader_graph.cpp
+++ b/intern/cycles/scene/shader_graph.cpp
@@ -888,7 +888,7 @@ void ShaderGraph::default_inputs(bool do_osl)
 
 void ShaderGraph::refine_bump_nodes()
 {
-  /* we transverse the node graph looking for bump nodes, when we find them,
+  /* We transverse the node graph looking for bump nodes, when we find them,
    * like in bump_from_displacement(), we copy the sub-graph defined from "bump"
    * input to the inputs "center","dx" and "dy" What is in "bump" input is moved
    * to "center" input. */
@@ -898,18 +898,18 @@ void ShaderGraph::refine_bump_nodes()
       ShaderInput *bump_input = node->input("Height");
       ShaderNodeSet nodes_bump;
 
-      /* make 2 extra copies of the subgraph defined in Bump input */
+      /* Make 2 extra copies of the subgraph defined in Bump input. */
       ShaderNodeMap nodes_dx;
       ShaderNodeMap nodes_dy;
 
-      /* find dependencies for the given input */
+      /* Find dependencies for the given input. */
       find_dependencies(nodes_bump, bump_input);
 
       copy_nodes(nodes_bump, nodes_dx);
       copy_nodes(nodes_bump, nodes_dy);
 
-      /* mark nodes to indicate they are use for bump computation, so
-         that any texture coordinates are shifted by dx/dy when sampling */
+      /* Mark nodes to indicate they are use for bump computation, so
+       * that any texture coordinates are shifted by dx/dy when sampling. */
       foreach (ShaderNode *node, nodes_bump)
         node->bump = SHADER_BUMP_CENTER;
       foreach (NodePair &pair, nodes_dx)
@@ -924,7 +924,7 @@ void ShaderGraph::refine_bump_nodes()
       connect(out_dx, node->input("SampleX"));
       connect(out_dy, node->input("SampleY"));
 
-      /* add generated nodes */
+      /* Add generated nodes. */
       foreach (NodePair &pair, nodes_dx)
         add(pair.second);
       foreach (NodePair &pair, nodes_dy)
diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp
index 03c152928d5..3b58556f601 100644
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -5882,7 +5882,7 @@ void BlackbodyNode::constant_fold(const ConstantFolder &folder)
   if (folder.all_inputs_constant()) {
     const float3 rgb_rec709 = svm_math_blackbody_color_rec709(temperature);
     const float3 rgb = folder.scene->shader_manager->rec709_to_scene_linear(rgb_rec709);
-    folder.make_constant(rgb);
+    folder.make_constant(max(rgb, zero_float3()));
   }
 }
 
diff --git a/intern/cycles/session/denoising.cpp b/intern/cycles/session/denoising.cpp
index ff9ca1b4345..a9377d412e8 100644
--- a/intern/cycles/session/denoising.cpp
+++ b/intern/cycles/session/denoising.cpp
@@ -90,7 +90,7 @@ static vector<ChannelMapping> output_channels()
   return map;
 }
 
-/* Renderlayer Handling */
+/* Render-layer Handling. */
 
 bool DenoiseImageLayer::detect_denoising_channels()
 {
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index 143628f1e30..dac36ab0135 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -946,7 +946,7 @@ TEST_F(RenderGraph, constant_fold_bright_contrast)
 TEST_F(RenderGraph, constant_fold_blackbody)
 {
   EXPECT_ANY_MESSAGE(log);
-  CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.94163, 0.226523, 0).");
+  CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.96553, 0.227897, 0).");
 
   builder
       .add_node(ShaderNodeBuilder<BlackbodyNode>(graph, "Blackbody").set("Temperature", 1200.0f))
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index af33264e044..c668638eb02 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -74,9 +74,9 @@ struct half4 {
 ccl_device_inline half float_to_half_image(float f)
 {
 #if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
-  return half(f);
+  return half(min(f, 65504.0f));
 #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
-  return __float2half(f);
+  return __float2half(min(f, 65504.0f));
 #else
   const uint u = __float_as_uint(f);
   /* Sign bit, shifted to its position. */
@@ -139,9 +139,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
 ccl_device_inline half float_to_half_display(const float f)
 {
 #if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
-  return half(f);
+  return half(min(f, 65504.0f));
 #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
-  return __float2half(f);
+  return __float2half(min(f, 65504.0f));
 #else
   const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
   const int absolute = x & 0x7FFFFFFF;
diff --git a/intern/cycles/util/image.h b/intern/cycles/util/image.h
index 9348125d072..17446a83e59 100644
--- a/intern/cycles/util/image.h
+++ b/intern/cycles/util/image.h
@@ -78,6 +78,26 @@ template<> inline half util_image_cast_from_float(float value)
   return float_to_half_image(value);
 }
 
+/* Multiply image pixels in native data format. */
+template<typename T> inline T util_image_multiply_native(T a, T b);
+
+template<> inline float util_image_multiply_native(float a, float b)
+{
+  return a * b;
+}
+template<> inline uchar util_image_multiply_native(uchar a, uchar b)
+{
+  return ((uint32_t)a * (uint32_t)b) / 255;
+}
+template<> inline uint16_t util_image_multiply_native(uint16_t a, uint16_t b)
+{
+  return ((uint32_t)a * (uint32_t)b) / 65535;
+}
+template<> inline half util_image_multiply_native(half a, half b)
+{
+  return float_to_half_image(half_to_float_image(a) * half_to_float_image(b));
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_IMAGE_H__ */
diff --git a/intern/cycles/util/texture.h b/intern/cycles/util/texture.h
index e8bb058a3c9..90e842933c2 100644
--- a/intern/cycles/util/texture.h
+++ b/intern/cycles/util/texture.h
@@ -37,6 +37,8 @@ typedef enum ImageDataType {
   IMAGE_DATA_TYPE_USHORT = 7,
   IMAGE_DATA_TYPE_NANOVDB_FLOAT = 8,
   IMAGE_DATA_TYPE_NANOVDB_FLOAT3 = 9,
+  IMAGE_DATA_TYPE_NANOVDB_FPN = 10,
+  IMAGE_DATA_TYPE_NANOVDB_FP16 = 11,
 
   IMAGE_DATA_NUM_TYPES
 } ImageDataType;
author	Sergey Sharybin <sergey@blender.org>	2022-06-08 11:58:35 +0300
committer	Sergey Sharybin <sergey@blender.org>	2022-06-08 11:58:35 +0300
commit	f31cef6248fd12039a4d2dfb76b26f3426477a70 (patch)
tree	4ffb531519115c06766354c4fa4acaebeeb5161e /intern/cycles
parent	15ee45fa029de175b5e366b0b1e9243a89dae543 (diff)
parent	5b0e9bd97504cd89f57dbbaa4814b7d0dd0f2ccd (diff)