diff options
author | Sergey Sharybin <sergey@blender.org> | 2022-06-08 11:58:35 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey@blender.org> | 2022-06-08 11:58:35 +0300 |
commit | f31cef6248fd12039a4d2dfb76b26f3426477a70 (patch) | |
tree | 4ffb531519115c06766354c4fa4acaebeeb5161e /intern/cycles | |
parent | 15ee45fa029de175b5e366b0b1e9243a89dae543 (diff) | |
parent | 5b0e9bd97504cd89f57dbbaa4814b7d0dd0f2ccd (diff) |
Merge branch 'master' into cycles_oneapi
Diffstat (limited to 'intern/cycles')
73 files changed, 1066 insertions, 398 deletions
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index 74b28b8ea21..05f27bdbd4d 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations bl_info = { diff --git a/intern/cycles/blender/addon/camera.py b/intern/cycles/blender/addon/camera.py index 0e78112699e..3c821c98128 100644 --- a/intern/cycles/blender/addon/camera.py +++ b/intern/cycles/blender/addon/camera.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation -# <pep8 compliant> - # Fit to match default projective camera with focal_length 50 and sensor_width 36. default_fisheye_polynomial = [ -1.1735143712967577e-05, diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 724e1b8f727..e211f53cf31 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations diff --git a/intern/cycles/blender/addon/operators.py b/intern/cycles/blender/addon/operators.py index e5d7f00a381..ab474cda0ab 100644 --- a/intern/cycles/blender/addon/operators.py +++ b/intern/cycles/blender/addon/operators.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations import bpy diff --git a/intern/cycles/blender/addon/osl.py b/intern/cycles/blender/addon/osl.py index 9430dc5d115..1ee7ae421e3 100644 --- a/intern/cycles/blender/addon/osl.py +++ b/intern/cycles/blender/addon/osl.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations import bpy diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index 5eaa592a9de..cc6d574da99 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations from bl_operators.presets import AddPresetBase diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 021b61d4751..5835d29c088 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations import bpy diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 71f64d7d360..8565ad7b263 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations import bpy diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py index 531ecc177da..12880496dfd 100644 --- a/intern/cycles/blender/addon/version_update.py +++ b/intern/cycles/blender/addon/version_update.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2011-2022 Blender Foundation - -# <pep8 compliant> from __future__ import annotations import bpy diff --git a/intern/cycles/blender/image.cpp b/intern/cycles/blender/image.cpp index e01b72c1653..aea79ad60ad 100644 --- a/intern/cycles/blender/image.cpp +++ b/intern/cycles/blender/image.cpp @@ -7,6 +7,8 @@ #include "blender/session.h" #include "blender/util.h" +#include "util/half.h" + CCL_NAMESPACE_BEGIN /* Packed Images */ @@ -19,17 +21,40 @@ BlenderImageLoader::BlenderImageLoader(BL::Image b_image, frame(frame), tile_number(tile_number), /* Don't free cache for preview render to avoid race condition from T93560, to be fixed - properly later as we are close to release. */ + * properly later as we are close to release. */ free_cache(!is_preview_render && !b_image.has_data()) { } bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaData &metadata) { - metadata.width = b_image.size()[0]; - metadata.height = b_image.size()[1]; + if (b_image.source() != BL::Image::source_TILED) { + /* Image sequence might have different dimensions, and hence needs to be handled in a special + * manner. + * NOTE: Currently the sequences are not handled by this image loader. */ + assert(b_image.source() != BL::Image::source_SEQUENCE); + + metadata.width = b_image.size()[0]; + metadata.height = b_image.size()[1]; + metadata.channels = b_image.channels(); + } + else { + /* Different UDIM tiles might have different resolutions, so get resolution from the actual + * tile. */ + BL::UDIMTile b_udim_tile = b_image.tiles.get(tile_number); + if (b_udim_tile) { + metadata.width = b_udim_tile.size()[0]; + metadata.height = b_udim_tile.size()[1]; + metadata.channels = b_udim_tile.channels(); + } + else { + metadata.width = 0; + metadata.height = 0; + metadata.channels = 0; + } + } + metadata.depth = 1; - metadata.channels = b_image.channels(); if (b_image.is_float()) { if (metadata.channels == 1) { @@ -62,80 +87,134 @@ bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaDat } bool BlenderImageLoader::load_pixels(const ImageMetaData &metadata, - void *pixels, - const size_t pixels_size, + void *out_pixels, + const size_t out_pixels_size, const bool associate_alpha) { const size_t num_pixels = ((size_t)metadata.width) * metadata.height; const int channels = metadata.channels; - if (b_image.is_float()) { - /* image data */ - float *image_pixels; - image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile_number); + if (metadata.type == IMAGE_DATA_TYPE_FLOAT || metadata.type == IMAGE_DATA_TYPE_FLOAT4) { + /* Float. */ + float *in_pixels = image_get_float_pixels_for_frame(b_image, frame, tile_number); - if (image_pixels && num_pixels * channels == pixels_size) { - memcpy(pixels, image_pixels, pixels_size * sizeof(float)); + if (in_pixels && num_pixels * channels == out_pixels_size) { + /* Straight copy pixel data. */ + memcpy(out_pixels, in_pixels, out_pixels_size * sizeof(float)); } else { + /* Missing or invalid pixel data. */ if (channels == 1) { - memset(pixels, 0, num_pixels * sizeof(float)); + memset(out_pixels, 0, num_pixels * sizeof(float)); } else { - const size_t num_pixels_safe = pixels_size / channels; - float *fp = (float *)pixels; - for (int i = 0; i < num_pixels_safe; i++, fp += channels) { - fp[0] = 1.0f; - fp[1] = 0.0f; - fp[2] = 1.0f; + const size_t num_pixels_safe = out_pixels_size / channels; + float *out_pixel = (float *)out_pixels; + for (int i = 0; i < num_pixels_safe; i++, out_pixel += channels) { + out_pixel[0] = 1.0f; + out_pixel[1] = 0.0f; + out_pixel[2] = 1.0f; if (channels == 4) { - fp[3] = 1.0f; + out_pixel[3] = 1.0f; } } } } - if (image_pixels) { - MEM_freeN(image_pixels); + if (in_pixels) { + MEM_freeN(in_pixels); } } - else { - unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile_number); + else if (metadata.type == IMAGE_DATA_TYPE_HALF || metadata.type == IMAGE_DATA_TYPE_HALF4) { + /* Half float. Blender does not have a half type, but in some cases + * we up-sample byte to half to avoid precision loss for colorspace + * conversion. */ + unsigned char *in_pixels = image_get_pixels_for_frame(b_image, frame, tile_number); - if (image_pixels && num_pixels * channels == pixels_size) { - memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char)); + if (in_pixels && num_pixels * channels == out_pixels_size) { + /* Convert uchar to half. */ + const uchar *in_pixel = in_pixels; + half *out_pixel = (half *)out_pixels; + if (associate_alpha && channels == 4) { + for (size_t i = 0; i < num_pixels; i++, in_pixel += 4, out_pixel += 4) { + const float alpha = util_image_cast_to_float(in_pixel[3]); + out_pixel[0] = float_to_half_image(util_image_cast_to_float(in_pixel[0]) * alpha); + out_pixel[1] = float_to_half_image(util_image_cast_to_float(in_pixel[1]) * alpha); + out_pixel[2] = float_to_half_image(util_image_cast_to_float(in_pixel[2]) * alpha); + out_pixel[3] = float_to_half_image(alpha); + } + } + else { + for (size_t i = 0; i < num_pixels; i++) { + for (int c = 0; c < channels; c++, in_pixel++, out_pixel++) { + *out_pixel = float_to_half_image(util_image_cast_to_float(*in_pixel)); + } + } + } } else { + /* Missing or invalid pixel data. */ if (channels == 1) { - memset(pixels, 0, pixels_size * sizeof(unsigned char)); + memset(out_pixels, 0, num_pixels * sizeof(half)); } else { - const size_t num_pixels_safe = pixels_size / channels; - unsigned char *cp = (unsigned char *)pixels; - for (size_t i = 0; i < num_pixels_safe; i++, cp += channels) { - cp[0] = 255; - cp[1] = 0; - cp[2] = 255; + const size_t num_pixels_safe = out_pixels_size / channels; + half *out_pixel = (half *)out_pixels; + for (int i = 0; i < num_pixels_safe; i++, out_pixel += channels) { + out_pixel[0] = float_to_half_image(1.0f); + out_pixel[1] = float_to_half_image(0.0f); + out_pixel[2] = float_to_half_image(1.0f); if (channels == 4) { - cp[3] = 255; + out_pixel[3] = float_to_half_image(1.0f); } } } } - if (image_pixels) { - MEM_freeN(image_pixels); + if (in_pixels) { + MEM_freeN(in_pixels); } + } + else { + /* Byte. */ + unsigned char *in_pixels = image_get_pixels_for_frame(b_image, frame, tile_number); + + if (in_pixels && num_pixels * channels == out_pixels_size) { + /* Straight copy pixel data. */ + memcpy(out_pixels, in_pixels, out_pixels_size * sizeof(unsigned char)); - if (associate_alpha) { - /* Premultiply, byte images are always straight for Blender. */ - unsigned char *cp = (unsigned char *)pixels; - for (size_t i = 0; i < num_pixels; i++, cp += channels) { - cp[0] = (cp[0] * cp[3]) / 255; - cp[1] = (cp[1] * cp[3]) / 255; - cp[2] = (cp[2] * cp[3]) / 255; + if (associate_alpha && channels == 4) { + /* Premultiply, byte images are always straight for Blender. */ + unsigned char *out_pixel = (unsigned char *)out_pixels; + for (size_t i = 0; i < num_pixels; i++, out_pixel += 4) { + out_pixel[0] = (out_pixel[0] * out_pixel[3]) / 255; + out_pixel[1] = (out_pixel[1] * out_pixel[3]) / 255; + out_pixel[2] = (out_pixel[2] * out_pixel[3]) / 255; + } + } + } + else { + /* Missing or invalid pixel data. */ + if (channels == 1) { + memset(out_pixels, 0, out_pixels_size * sizeof(unsigned char)); + } + else { + const size_t num_pixels_safe = out_pixels_size / channels; + unsigned char *out_pixel = (unsigned char *)out_pixels; + for (size_t i = 0; i < num_pixels_safe; i++, out_pixel += channels) { + out_pixel[0] = 255; + out_pixel[1] = 0; + out_pixel[2] = 255; + if (channels == 4) { + out_pixel[3] = 255; + } + } } } + + if (in_pixels) { + MEM_freeN(in_pixels); + } } /* Free image buffers to save memory during render. */ diff --git a/intern/cycles/blender/mesh.cpp b/intern/cycles/blender/mesh.cpp index c76ce3801d4..e2db52cc5c1 100644 --- a/intern/cycles/blender/mesh.cpp +++ b/intern/cycles/blender/mesh.cpp @@ -267,75 +267,62 @@ static void mikk_compute_tangents( genTangSpaceDefault(&context); } -/* Create sculpt vertex color attributes. */ -static void attr_create_sculpt_vertex_color(Scene *scene, - Mesh *mesh, - BL::Mesh &b_mesh, - bool subdivision) -{ - for (BL::MeshVertColorLayer &l : b_mesh.sculpt_vertex_colors) { - const bool active_render = l.active_render(); - AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE; - ustring vcol_name = ustring(l.name().c_str()); - - const bool need_vcol = mesh->need_attribute(scene, vcol_name) || - mesh->need_attribute(scene, vcol_std); - - if (!need_vcol) { - continue; - } - - AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes; - Attribute *vcol_attr = attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_VERTEX); - vcol_attr->std = vcol_std; - - float4 *cdata = vcol_attr->data_float4(); - int numverts = b_mesh.vertices.length(); - - for (int i = 0; i < numverts; i++) { - *(cdata++) = get_float4(l.data[i].color()); - } - } -} - template<typename TypeInCycles, typename GetValueAtIndex> static void fill_generic_attribute(BL::Mesh &b_mesh, TypeInCycles *data, const BL::Attribute::domain_enum b_domain, + const bool subdivision, const GetValueAtIndex &get_value_at_index) { switch (b_domain) { case BL::Attribute::domain_CORNER: { - for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) { - const int index = t.index() * 3; - BL::Array<int, 3> loops = t.loops(); - data[index] = get_value_at_index(loops[0]); - data[index + 1] = get_value_at_index(loops[1]); - data[index + 2] = get_value_at_index(loops[2]); + if (subdivision) { + for (BL::MeshPolygon &p : b_mesh.polygons) { + int n = p.loop_total(); + for (int i = 0; i < n; i++) { + *data = get_value_at_index(p.loop_start() + i); + data++; + } + } + } + else { + for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) { + const int index = t.index() * 3; + BL::Array<int, 3> loops = t.loops(); + data[index] = get_value_at_index(loops[0]); + data[index + 1] = get_value_at_index(loops[1]); + data[index + 2] = get_value_at_index(loops[2]); + } } break; } case BL::Attribute::domain_EDGE: { - /* Average edge attributes at vertices. */ - const size_t num_verts = b_mesh.vertices.length(); - vector<int> count(num_verts, 0); - - for (BL::MeshEdge &e : b_mesh.edges) { - BL::Array<int, 2> vertices = e.vertices(); - TypeInCycles value = get_value_at_index(e.index()); - - data[vertices[0]] += value; - data[vertices[1]] += value; - count[vertices[0]]++; - count[vertices[1]]++; + if constexpr (std::is_same_v<TypeInCycles, uchar4>) { + /* uchar4 edge attributes do not exist, and averaging in place + * would not work. */ + assert(0); } + else { + /* Average edge attributes at vertices. */ + const size_t num_verts = b_mesh.vertices.length(); + vector<int> count(num_verts, 0); + + for (BL::MeshEdge &e : b_mesh.edges) { + BL::Array<int, 2> vertices = e.vertices(); + TypeInCycles value = get_value_at_index(e.index()); + + data[vertices[0]] += value; + data[vertices[1]] += value; + count[vertices[0]]++; + count[vertices[1]]++; + } - for (size_t i = 0; i < num_verts; i++) { - if (count[i] > 1) { - data[i] /= (float)count[i]; + for (size_t i = 0; i < num_verts; i++) { + if (count[i] > 1) { + data[i] /= (float)count[i]; + } } } - break; } case BL::Attribute::domain_POINT: { @@ -346,8 +333,16 @@ static void fill_generic_attribute(BL::Mesh &b_mesh, break; } case BL::Attribute::domain_FACE: { - for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) { - data[t.index()] = get_value_at_index(t.polygon_index()); + if (subdivision) { + const int num_polygons = b_mesh.polygons.length(); + for (int i = 0; i < num_polygons; i++) { + data[i] = get_value_at_index(i); + } + } + else { + for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) { + data[t.index()] = get_value_at_index(t.polygon_index()); + } } break; } @@ -395,21 +390,22 @@ static void attr_create_generic(Scene *scene, const bool need_motion, const float motion_scale) { - if (subdivision) { - /* TODO: Handle subdivision correctly. */ - return; - } - AttributeSet &attributes = mesh->attributes; + AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes; static const ustring u_velocity("velocity"); + int attribute_index = 0; + int render_color_index = b_mesh.attributes.render_color_index(); + for (BL::Attribute &b_attribute : b_mesh.attributes) { const ustring name{b_attribute.name().c_str()}; + const bool is_render_color = (attribute_index++ == render_color_index); if (need_motion && name == u_velocity) { attr_create_motion(mesh, b_attribute, motion_scale); } - if (!mesh->need_attribute(scene, name)) { + if (!(mesh->need_attribute(scene, name) || + (is_render_color && mesh->need_attribute(scene, ATTR_STD_VERTEX_COLOR)))) { continue; } if (attributes.find(name)) { @@ -445,15 +441,16 @@ static void attr_create_generic(Scene *scene, BL::FloatAttribute b_float_attribute{b_attribute}; Attribute *attr = attributes.add(name, TypeFloat, element); float *data = attr->data_float(); - fill_generic_attribute( - b_mesh, data, b_domain, [&](int i) { return b_float_attribute.data[i].value(); }); + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { + return b_float_attribute.data[i].value(); + }); break; } case BL::Attribute::data_type_BOOLEAN: { BL::BoolAttribute b_bool_attribute{b_attribute}; Attribute *attr = attributes.add(name, TypeFloat, element); float *data = attr->data_float(); - fill_generic_attribute(b_mesh, data, b_domain, [&](int i) { + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { return (float)b_bool_attribute.data[i].value(); }); break; @@ -462,25 +459,59 @@ static void attr_create_generic(Scene *scene, BL::IntAttribute b_int_attribute{b_attribute}; Attribute *attr = attributes.add(name, TypeFloat, element); float *data = attr->data_float(); - fill_generic_attribute( - b_mesh, data, b_domain, [&](int i) { return (float)b_int_attribute.data[i].value(); }); + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { + return (float)b_int_attribute.data[i].value(); + }); break; } case BL::Attribute::data_type_FLOAT_VECTOR: { BL::FloatVectorAttribute b_vector_attribute{b_attribute}; Attribute *attr = attributes.add(name, TypeVector, element); float3 *data = attr->data_float3(); - fill_generic_attribute(b_mesh, data, b_domain, [&](int i) { + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { BL::Array<float, 3> v = b_vector_attribute.data[i].vector(); return make_float3(v[0], v[1], v[2]); }); break; } + case BL::Attribute::data_type_BYTE_COLOR: { + BL::ByteColorAttribute b_color_attribute{b_attribute}; + + if (element == ATTR_ELEMENT_CORNER) { + element = ATTR_ELEMENT_CORNER_BYTE; + } + Attribute *attr = attributes.add(name, TypeRGBA, element); + if (is_render_color) { + attr->std = ATTR_STD_VERTEX_COLOR; + } + + if (element == ATTR_ELEMENT_CORNER_BYTE) { + uchar4 *data = attr->data_uchar4(); + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { + /* Compress/encode vertex color using the sRGB curve. */ + const float4 c = get_float4(b_color_attribute.data[i].color()); + return color_float4_to_uchar4(color_linear_to_srgb_v4(c)); + }); + } + else { + float4 *data = attr->data_float4(); + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { + BL::Array<float, 4> v = b_color_attribute.data[i].color(); + return make_float4(v[0], v[1], v[2], v[3]); + }); + } + break; + } case BL::Attribute::data_type_FLOAT_COLOR: { BL::FloatColorAttribute b_color_attribute{b_attribute}; + Attribute *attr = attributes.add(name, TypeRGBA, element); + if (is_render_color) { + attr->std = ATTR_STD_VERTEX_COLOR; + } + float4 *data = attr->data_float4(); - fill_generic_attribute(b_mesh, data, b_domain, [&](int i) { + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { BL::Array<float, 4> v = b_color_attribute.data[i].color(); return make_float4(v[0], v[1], v[2], v[3]); }); @@ -490,7 +521,7 @@ static void attr_create_generic(Scene *scene, BL::Float2Attribute b_float2_attribute{b_attribute}; Attribute *attr = attributes.add(name, TypeFloat2, element); float2 *data = attr->data_float2(); - fill_generic_attribute(b_mesh, data, b_domain, [&](int i) { + fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) { BL::Array<float, 2> v = b_float2_attribute.data[i].vector(); return make_float2(v[0], v[1]); }); @@ -503,69 +534,6 @@ static void attr_create_generic(Scene *scene, } } -/* Create vertex color attributes. */ -static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivision) -{ - for (BL::MeshLoopColorLayer &l : b_mesh.vertex_colors) { - const bool active_render = l.active_render(); - AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE; - ustring vcol_name = ustring(l.name().c_str()); - - const bool need_vcol = mesh->need_attribute(scene, vcol_name) || - mesh->need_attribute(scene, vcol_std); - - if (!need_vcol) { - continue; - } - - Attribute *vcol_attr = NULL; - - if (subdivision) { - if (active_render) { - vcol_attr = mesh->subd_attributes.add(vcol_std, vcol_name); - } - else { - vcol_attr = mesh->subd_attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE); - } - - uchar4 *cdata = vcol_attr->data_uchar4(); - - for (BL::MeshPolygon &p : b_mesh.polygons) { - int n = p.loop_total(); - for (int i = 0; i < n; i++) { - float4 color = get_float4(l.data[p.loop_start() + i].color()); - /* Compress/encode vertex color using the sRGB curve. */ - *(cdata++) = color_float4_to_uchar4(color); - } - } - } - else { - if (active_render) { - vcol_attr = mesh->attributes.add(vcol_std, vcol_name); - } - else { - vcol_attr = mesh->attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_CORNER_BYTE); - } - - uchar4 *cdata = vcol_attr->data_uchar4(); - - for (BL::MeshLoopTriangle &t : b_mesh.loop_triangles) { - int3 li = get_int3(t.loops()); - float4 c1 = get_float4(l.data[li[0]].color()); - float4 c2 = get_float4(l.data[li[1]].color()); - float4 c3 = get_float4(l.data[li[2]].color()); - - /* Compress/encode vertex color using the sRGB curve. */ - cdata[0] = color_float4_to_uchar4(c1); - cdata[1] = color_float4_to_uchar4(c2); - cdata[2] = color_float4_to_uchar4(c3); - - cdata += 3; - } - } - } -} - /* Create uv map attributes. */ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh) { @@ -1029,8 +997,6 @@ static void create_mesh(Scene *scene, * The calculate functions will check whether they're needed or not. */ attr_create_pointiness(scene, mesh, b_mesh, subdivision); - attr_create_vertex_color(scene, mesh, b_mesh, subdivision); - attr_create_sculpt_vertex_color(scene, mesh, b_mesh, subdivision); attr_create_random_per_island(scene, mesh, b_mesh, subdivision); attr_create_generic(scene, mesh, b_mesh, subdivision, need_motion, motion_scale); diff --git a/intern/cycles/blender/volume.cpp b/intern/cycles/blender/volume.cpp index 8dd2d45c0b6..61b2f9ee276 100644 --- a/intern/cycles/blender/volume.cpp +++ b/intern/cycles/blender/volume.cpp @@ -219,7 +219,10 @@ static void sync_smoke_volume( class BlenderVolumeLoader : public VDBImageLoader { public: - BlenderVolumeLoader(BL::BlendData &b_data, BL::Volume &b_volume, const string &grid_name) + BlenderVolumeLoader(BL::BlendData &b_data, + BL::Volume &b_volume, + const string &grid_name, + BL::VolumeRender::precision_enum precision_) : VDBImageLoader(grid_name), b_volume(b_volume) { b_volume.grids.load(b_data.ptr.data); @@ -241,6 +244,22 @@ class BlenderVolumeLoader : public VDBImageLoader { } } #endif +#ifdef WITH_NANOVDB + switch (precision_) { + case BL::VolumeRender::precision_FULL: + precision = 32; + break; + case BL::VolumeRender::precision_HALF: + precision = 16; + break; + default: + case BL::VolumeRender::precision_VARIABLE: + precision = 0; + break; + } +#else + (void)precision_; +#endif } BL::Volume b_volume; @@ -318,7 +337,8 @@ static void sync_volume_object(BL::BlendData &b_data, volume->attributes.add(std) : volume->attributes.add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL); - ImageLoader *loader = new BlenderVolumeLoader(b_data, b_volume, name.string()); + ImageLoader *loader = new BlenderVolumeLoader( + b_data, b_volume, name.string(), b_render.precision()); ImageParams params; params.frame = b_volume.grids.frame(); diff --git a/intern/cycles/bvh/build.cpp b/intern/cycles/bvh/build.cpp index 79e9b800690..1df3517673e 100644 --- a/intern/cycles/bvh/build.cpp +++ b/intern/cycles/bvh/build.cpp @@ -811,7 +811,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range, /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */ unalignedSplitSAH = params.sah_node_cost * unaligned_split.bounds.half_area() + params.sah_primitive_cost * unaligned_split.nodeSAH; - /* TOOD(sergey): Check we can create leaf already. */ + /* TODO(sergey): Check we can create leaf already. */ /* Check whether unaligned split is better than the regular one. */ if (unalignedSplitSAH < splitSAH) { do_unalinged_split = true; diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 5b875565c97..6bcd8152874 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -146,8 +146,8 @@ if(CYCLES_STANDALONE_REPOSITORY) -DOIIO_STATIC_DEFINE ) - set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO_ROOT_DIR}/include) - set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR} ${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO) + set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO_ROOT_DIR}/include) + set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR} ${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO) # Special exceptions for libraries which needs explicit debug version set(OPENIMAGEIO_LIBRARIES optimized ${OPENIMAGEIO_ROOT_DIR}/lib/OpenImageIO.lib diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 6908ae5ead3..cb7e909a2d5 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -457,6 +457,8 @@ void CUDADevice::reserve_local_memory(const uint kernel_features) /* Use the biggest kernel for estimation. */ const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE : + (kernel_features & KERNEL_FEATURE_MNEE) ? + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE : DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE; /* Launch kernel, using just 1 block appears sufficient to reserve memory for all @@ -1084,7 +1086,9 @@ void CUDADevice::tex_alloc(device_texture &mem) need_texture_info = true; if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16) { CUDA_RESOURCE_DESC resDesc; memset(&resDesc, 0, sizeof(resDesc)); diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 7159277b325..ea68c821166 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -420,6 +420,8 @@ void HIPDevice::reserve_local_memory(const uint kernel_features) /* Use the biggest kernel for estimation. */ const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE : + (kernel_features & KERNEL_FEATURE_MNEE) ? + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE : DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE; /* Launch kernel, using just 1 block appears sufficient to reserve memory for all @@ -1042,7 +1044,9 @@ void HIPDevice::tex_alloc(device_texture &mem) need_texture_info = true; if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16) { /* Bindless textures. */ hipResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp index 072731a2af5..96a99cd62cd 100644 --- a/intern/cycles/device/kernel.cpp +++ b/intern/cycles/device/kernel.cpp @@ -33,6 +33,8 @@ const char *device_kernel_as_string(DeviceKernel kernel) return "integrator_shade_surface"; case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: return "integrator_shade_surface_raytrace"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: + return "integrator_shade_surface_mnee"; case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: return "integrator_shade_volume"; case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp index 4c068dbdd3e..40cf2573cfb 100644 --- a/intern/cycles/device/memory.cpp +++ b/intern/cycles/device/memory.cpp @@ -165,6 +165,8 @@ device_texture::device_texture(Device *device, case IMAGE_DATA_TYPE_BYTE: case IMAGE_DATA_TYPE_NANOVDB_FLOAT: case IMAGE_DATA_TYPE_NANOVDB_FLOAT3: + case IMAGE_DATA_TYPE_NANOVDB_FPN: + case IMAGE_DATA_TYPE_NANOVDB_FP16: data_type = TYPE_UCHAR; data_elements = 1; break; diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm index 086fbb093ba..09c4ace081e 100644 --- a/intern/cycles/device/metal/bvh.mm +++ b/intern/cycles/device/metal/bvh.mm @@ -11,6 +11,7 @@ # include "util/progress.h" # include "device/metal/bvh.h" +# include "device/metal/util.h" CCL_NAMESPACE_BEGIN @@ -18,6 +19,7 @@ CCL_NAMESPACE_BEGIN { \ string str = string_printf(__VA_ARGS__); \ progress.set_substatus(str); \ + metal_printf("%s\n", str.c_str()); \ } BVHMetal::BVHMetal(const BVHParams ¶ms_, diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 7506b9b069f..0e6817d94f8 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -31,6 +31,8 @@ class MetalDevice : public Device { string source[PSO_NUM]; string source_md5[PSO_NUM]; + bool capture_enabled = false; + KernelParamsMetal launch_params = {0}; /* MetalRT members ----------------------------------*/ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index e1438a9d6e2..086bf0af979 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -86,6 +86,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile use_metalrt = (atoi(metalrt) != 0); } + if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + capture_enabled = true; + } + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -394,7 +398,7 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) } if (size > 0) { - if (mem.type == MEM_DEVICE_ONLY) { + if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) { options = MTLResourceStorageModePrivate; } @@ -697,8 +701,7 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem) void MetalDevice::tex_alloc(device_texture &mem) { /* Check that dimensions fit within maximum allowable size. - See https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - */ + * See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */ if (mem.data_width > 16384 || mem.data_height > 16384) { set_error(string_printf( "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)", diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 304efc813ec..fec4cd80466 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -35,7 +35,8 @@ bool kernel_has_intersection(DeviceKernel device_kernel) device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || - device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE); + device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); } struct ShaderCache { diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 6cc84a20787..de20514de0b 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -12,8 +12,6 @@ # include "device/metal/util.h" # include "kernel/device/metal/globals.h" -# define metal_printf VLOG(4) << string_printf - CCL_NAMESPACE_BEGIN class MetalDevice; @@ -77,6 +75,38 @@ class MetalDeviceQueue : public DeviceQueue { void close_compute_encoder(); void close_blit_encoder(); + + bool verbose_tracing = false; + + /* Per-kernel profiling (see CYCLES_METAL_PROFILING). */ + + struct TimingData { + DeviceKernel kernel; + int work_size; + uint64_t timing_id; + }; + std::vector<TimingData> command_encoder_labels; + id<MTLSharedEvent> timing_shared_event = nil; + uint64_t timing_shared_event_id; + uint64_t command_buffer_start_timing_id; + + struct TimingStats { + double total_time = 0.0; + uint64_t total_work_size = 0; + uint64_t num_dispatches = 0; + }; + TimingStats timing_stats[DEVICE_KERNEL_NUM]; + double last_completion_time = 0.0; + + /* .gputrace capture (see CYCLES_DEBUG_METAL_CAPTURE_...). */ + + id<MTLCaptureScope> mtlCaptureScope = nil; + DeviceKernel capture_kernel; + int capture_dispatch = 0; + int capture_dispatch_counter = 0; + bool is_capturing = false; + bool is_capturing_to_disk = false; + bool has_captured_to_disk = false; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index ec10e091b25..8b2d5d81859 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -37,6 +37,61 @@ MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device) } wait_semaphore = dispatch_semaphore_create(0); + + if (@available(macos 10.14, *)) { + if (getenv("CYCLES_METAL_PROFILING")) { + /* Enable per-kernel timing breakdown (shown at end of render). */ + timing_shared_event = [mtlDevice newSharedEvent]; + } + if (getenv("CYCLES_METAL_DEBUG")) { + /* Enable very verbose tracing (shows every dispatch). */ + verbose_tracing = true; + } + timing_shared_event_id = 1; + } + + capture_kernel = DeviceKernel(-1); + if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) { + /* Enable .gputrace capture for the specified DeviceKernel. */ + MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager]; + mtlCaptureScope = [captureManager newCaptureScopeWithDevice:mtlDevice]; + mtlCaptureScope.label = [NSString stringWithFormat:@"Cycles kernel dispatch"]; + [captureManager setDefaultCaptureScope:mtlCaptureScope]; + + capture_dispatch = -1; + if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) { + capture_dispatch = atoi(capture_dispatch_str); + capture_dispatch_counter = 0; + } + + capture_kernel = DeviceKernel(atoi(capture_kernel_str)); + printf("Capture kernel: %d = %s\n", capture_kernel, device_kernel_as_string(capture_kernel)); + + if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) { + if (@available(macos 10.15, *)) { + if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) { + + MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init]; + captureDescriptor.captureObject = mtlCaptureScope; + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)]; + + NSError *error; + if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) { + NSString *err = [error localizedDescription]; + printf("Start capture failed: %s\n", [err UTF8String]); + } + else { + printf("Capture started (URL: %s)\n", capture_url); + is_capturing_to_disk = true; + } + } + else { + printf("Capture to file is not supported\n"); + } + } + } + } } MetalDeviceQueue::~MetalDeviceQueue() @@ -58,6 +113,56 @@ MetalDeviceQueue::~MetalDeviceQueue() [mtlCommandQueue release]; mtlCommandQueue = nil; } + + if (mtlCaptureScope) { + [mtlCaptureScope release]; + } + + double total_time = 0.0; + + /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */ + int64_t total_work_size = 0; + int64_t num_dispatches = 0; + for (auto &stat : timing_stats) { + total_time += stat.total_time; + total_work_size += stat.total_work_size; + num_dispatches += stat.num_dispatches; + } + + if (num_dispatches) { + printf("\nMetal dispatch stats:\n\n"); + auto header = string_printf("%-40s %16s %12s %12s %7s %7s", + "Kernel name", + "Total threads", + "Dispatches", + "Avg. T/D", + "Time", + "Time%"); + auto divider = string(header.length(), '-'); + printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str()); + + for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) { + auto &stat = timing_stats[i]; + if (stat.num_dispatches > 0) { + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + device_kernel_as_string(DeviceKernel(i)), + string_human_readable_number(stat.total_work_size).c_str(), + string_human_readable_number(stat.num_dispatches).c_str(), + string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(), + stat.total_time, + stat.total_time * 100.0 / total_time); + } + } + printf("%s\n", divider.c_str()); + printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n", + "", + "", + string_human_readable_number(num_dispatches).c_str(), + "", + total_time, + 100.0); + printf("%s\n\n", divider.c_str()); + } } int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const @@ -101,6 +206,19 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args) { + if (kernel == capture_kernel) { + if (capture_dispatch < 0 || capture_dispatch == capture_dispatch_counter) { + /* Start gputrace capture. */ + if (mtlCommandBuffer) { + synchronize(); + } + [mtlCaptureScope beginScope]; + printf("[mtlCaptureScope beginScope]\n"); + is_capturing = true; + } + capture_dispatch_counter += 1; + } + if (metal_device->have_error()) { return false; } @@ -110,6 +228,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); + if (timing_shared_event) { + command_encoder_labels.push_back({kernel, work_size, timing_shared_event_id}); + } + /* Determine size requirement for argument buffer. */ size_t arg_buffer_length = 0; for (size_t i = 0; i < args.count; i++) { @@ -189,6 +311,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Encode KernelParamsMetal buffers */ [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets]; + if (verbose_tracing || timing_shared_event || is_capturing) { + /* Add human-readable labels if we're doing any form of debugging / profiling. */ + mtlComputeCommandEncoder.label = [[NSString alloc] + initWithFormat:@"Metal queue launch %s, work_size %d", + device_kernel_as_string(kernel), + work_size]; + } + /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + sizeof(IntegratorStateGPU); @@ -196,7 +326,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, int pointer_index = offset / sizeof(device_ptr); MetalDevice::MetalMem *mmem = *( MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset); - if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) { + if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) { [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:pointer_index]; @@ -270,6 +400,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: break; default: bvhMetalRT = nil; @@ -343,12 +474,53 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } }]; + if (verbose_tracing || is_capturing) { + /* Force a sync we've enabled step-by-step verbose tracing or if we're capturing. */ + synchronize(); + + /* Show queue counters and dispatch timing. */ + if (verbose_tracing) { + if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) { + printf( + "_____________________________________.____________________.______________.___________" + "______________________________________\n"); + } + + printf("%-40s| %7d threads |%5.2fms | buckets [", + device_kernel_as_string(kernel), + work_size, + last_completion_time * 1000.0); + std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex); + for (auto &it : metal_device->metal_mem_map) { + const string c_integrator_queue_counter = "integrator_queue_counter"; + if (it.first->name == c_integrator_queue_counter) { + /* Workaround "device_copy_from" being protected. */ + struct MyDeviceMemory : device_memory { + void device_copy_from__IntegratorQueueCounter() + { + device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter)); + } + }; + ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter(); + + if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *) + it.first->host_pointer) { + for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) + printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i])); + } + break; + } + } + printf("]\n"); + } + } + return !(metal_device->have_error()); } bool MetalDeviceQueue::synchronize() { - if (metal_device->have_error()) { + if (has_captured_to_disk || metal_device->have_error()) { return false; } @@ -358,6 +530,28 @@ bool MetalDeviceQueue::synchronize() close_blit_encoder(); if (mtlCommandBuffer) { + scoped_timer timer; + if (timing_shared_event) { + /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */ + __block double completion_time = 0; + for (uint64_t i = command_buffer_start_timing_id; i < timing_shared_event_id; i++) { + [timing_shared_event notifyListener:shared_event_listener + atValue:i + block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) { + completion_time = timer.get_time() - completion_time; + last_completion_time = completion_time; + for (auto label : command_encoder_labels) { + if (label.timing_id == value) { + TimingStats &stat = timing_stats[label.kernel]; + stat.num_dispatches++; + stat.total_time += completion_time; + stat.total_work_size += label.work_size; + } + } + }]; + } + } + uint64_t shared_event_id = this->shared_event_id++; if (@available(macos 10.14, *)) { @@ -373,6 +567,22 @@ bool MetalDeviceQueue::synchronize() dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER); } + if (is_capturing) { + [mtlCaptureScope endScope]; + is_capturing = false; + printf("[mtlCaptureScope endScope]\n"); + + if (is_capturing_to_disk) { + if (@available(macos 10.15, *)) { + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + has_captured_to_disk = true; + is_capturing_to_disk = false; + is_capturing = false; + printf("Capture stopped\n"); + } + } + } + [mtlCommandBuffer release]; for (const CopyBack &mmem : copy_back_mem) { @@ -384,6 +594,7 @@ bool MetalDeviceQueue::synchronize() metal_device->flush_delayed_free_list(); mtlCommandBuffer = nil; + command_encoder_labels.clear(); } return !(metal_device->have_error()); @@ -529,6 +740,13 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel { bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM); + if (timing_shared_event) { + /* Close the current encoder to ensure we're able to capture per-encoder timing data. */ + if (mtlComputeEncoder) { + close_compute_encoder(); + } + } + if (@available(macos 10.14, *)) { if (mtlComputeEncoder) { if (mtlComputeEncoder.dispatchType == concurrent ? MTLDispatchTypeConcurrent : @@ -574,6 +792,7 @@ id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder() if (!mtlCommandBuffer) { mtlCommandBuffer = [mtlCommandQueue commandBuffer]; [mtlCommandBuffer retain]; + command_buffer_start_timing_id = timing_shared_event_id; } mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder]; @@ -584,6 +803,10 @@ void MetalDeviceQueue::close_compute_encoder() { [mtlComputeEncoder endEncoding]; mtlComputeEncoder = nil; + + if (timing_shared_event) { + [mtlCommandBuffer encodeSignalEvent:timing_shared_event value:timing_shared_event_id++]; + } } void MetalDeviceQueue::close_blit_encoder() diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index cc653ab7e12..f728967835d 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -14,6 +14,8 @@ # include "util/thread.h" +# define metal_printf VLOG(4) << string_printf + CCL_NAMESPACE_BEGIN enum MetalGPUVendor { diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index 9fc265bc327..9ab9bbb59c5 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -452,9 +452,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } { /* Load and compile PTX module with OptiX kernels. */ - string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); + string ptx_data, ptx_filename = path_get( + (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? + "lib/kernel_optix_shader_raytrace.ptx" : + "lib/kernel_optix.ptx"); if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { if (!getenv("OPTIX_ROOT_DIR")) { set_error( @@ -464,7 +465,9 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } ptx_filename = compile_kernel( kernel_features, - (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel", + (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ? + "kernel_shader_raytrace" : + "kernel", "optix", true); } @@ -550,7 +553,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features) OptixBuiltinISOptions builtin_options = {}; # if OPTIX_ABI_VERSION >= 55 builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM; - builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE; + builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | + OPTIX_BUILD_FLAG_ALLOW_COMPACTION; builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */ # else builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; @@ -620,6 +624,14 @@ bool OptiXDevice::load_kernels(const uint kernel_features) "__direct_callable__svm_node_bevel"; } + /* MNEE. */ + if (kernel_features & KERNEL_FEATURE_MNEE) { + group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface_mnee"; + } + optix_assert(optixProgramGroupCreate( context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); @@ -701,6 +713,46 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); } + if (kernel_features & KERNEL_FEATURE_MNEE) { + /* Create MNEE pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + pipeline_groups.push_back(groups[PG_HITV]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + if (kernel_features & KERNEL_FEATURE_POINTCLOUD) { + pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]); + pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]); + } + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_SHADE_MNEE])); + + /* Combine ray generation and trace continuation stack size. */ + const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG + + link_options.maxTraceDepth * trace_css; + const unsigned int dss = 0; + + /* Set stack size depending on pipeline options. */ + optix_assert( + optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2)); + } + { /* Create intersection-only pipeline. */ vector<OptixProgramGroup> pipeline_groups; pipeline_groups.reserve(NUM_PROGRAM_GROUPS); @@ -1186,7 +1238,7 @@ bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context) const OptixResult result = optixDenoiserSetup( denoiser_.optix_denoiser, 0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called - on a stream that is not the default stream */ + * on a stream that is not the default stream. */ tile_size.x + denoiser_.sizes.overlapWindowSizeInPixels * 2, tile_size.y + denoiser_.sizes.overlapWindowSizeInPixels * 2, denoiser_.state.device_pointer, @@ -1336,7 +1388,10 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, OptixAccelBufferSizes sizes = {}; OptixAccelBuildOptions options = {}; options.operation = operation; - if (use_fast_trace_bvh) { + if (use_fast_trace_bvh || + /* The build flags have to match the ones used to query the built-in curve intersection + program (see optixBuiltinISModuleGet above) */ + build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) { VLOG(2) << "Using fast to trace OptiX BVH"; options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; } @@ -1861,7 +1916,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) { /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves, * since it needs to filter out end-caps there). - + * * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT. */ diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h index 1f53c729c3f..817afdc8384 100644 --- a/intern/cycles/device/optix/device_impl.h +++ b/intern/cycles/device/optix/device_impl.h @@ -24,6 +24,7 @@ enum { PG_RGEN_INTERSECT_SUBSURFACE, PG_RGEN_INTERSECT_VOLUME_STACK, PG_RGEN_SHADE_SURFACE_RAYTRACE, + PG_RGEN_SHADE_SURFACE_MNEE, PG_MISS, PG_HITD, /* Default hit group. */ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ @@ -46,7 +47,7 @@ static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; static const int NUM_CALLABLE_PROGRAM_GROUPS = 2; /* List of OptiX pipelines. */ -enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES }; +enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES }; /* A single shader binding table entry. */ struct SbtRecord { diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp index d635512c58a..366bf95269d 100644 --- a/intern/cycles/device/optix/queue.cpp +++ b/intern/cycles/device/optix/queue.cpp @@ -28,6 +28,7 @@ void OptiXDeviceQueue::init_execution() static bool is_optix_specific_kernel(DeviceKernel kernel) { return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE || kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || @@ -63,7 +64,8 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, cuda_stream_)); if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) { cuda_device_assert( cuda_device_, cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), @@ -82,6 +84,10 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: + pipeline = optix_device->pipelines[PIP_SHADE_MNEE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord); + break; case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: pipeline = optix_device->pipelines[PIP_INTERSECT]; sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h index b3a55efb327..6c8bfed7ec8 100644 --- a/intern/cycles/graph/node_enum.h +++ b/intern/cycles/graph/node_enum.h @@ -19,8 +19,10 @@ struct NodeEnum { } void insert(const char *x, int y) { - left[ustring(x)] = y; - right[y] = ustring(x); + ustring ustr_x(x); + + left[ustr_x] = y; + right[y] = ustr_x; } bool exists(ustring x) const diff --git a/intern/cycles/hydra/display_driver.cpp b/intern/cycles/hydra/display_driver.cpp index a809ace63e2..0c0b577c358 100644 --- a/intern/cycles/hydra/display_driver.cpp +++ b/intern/cycles/hydra/display_driver.cpp @@ -23,10 +23,18 @@ HdCyclesDisplayDriver::HdCyclesDisplayDriver(HdCyclesSession *renderParam, Hgi * HdCyclesDisplayDriver::~HdCyclesDisplayDriver() { - deinit(); + if (texture_) { + _hgi->DestroyTexture(&texture_); + } + + if (gl_pbo_id_) { + glDeleteBuffers(1, &gl_pbo_id_); + } + + gl_context_dispose(); } -void HdCyclesDisplayDriver::init() +void HdCyclesDisplayDriver::gl_context_create() { #ifdef _WIN32 if (!gl_context_) { @@ -64,16 +72,42 @@ void HdCyclesDisplayDriver::init() } } -void HdCyclesDisplayDriver::deinit() +bool HdCyclesDisplayDriver::gl_context_enable() { - if (texture_) { - _hgi->DestroyTexture(&texture_); +#ifdef _WIN32 + if (!hdc_ || !gl_context_) { + return false; } - if (gl_pbo_id_) { - glDeleteBuffers(1, &gl_pbo_id_); + mutex_.lock(); + + // Do not change context if this is called in the main thread + if (wglGetCurrentContext() == nullptr) { + if (!TF_VERIFY(wglMakeCurrent((HDC)hdc_, (HGLRC)gl_context_))) { + mutex_.unlock(); + return false; + } + } + + return true; +#else + return false; +#endif +} + +void HdCyclesDisplayDriver::gl_context_disable() +{ +#ifdef _WIN32 + if (wglGetCurrentContext() == gl_context_) { + TF_VERIFY(wglMakeCurrent(nullptr, nullptr)); } + mutex_.unlock(); +#endif +} + +void HdCyclesDisplayDriver::gl_context_dispose() +{ #ifdef _WIN32 if (gl_context_) { TF_VERIFY(wglDeleteContext((HGLRC)gl_context_)); @@ -90,13 +124,9 @@ bool HdCyclesDisplayDriver::update_begin(const Params ¶ms, int texture_width, int texture_height) { -#ifdef _WIN32 - if (!hdc_ || !gl_context_) { + if (!gl_context_enable()) { return false; } -#endif - - graphics_interop_activate(); if (gl_render_sync_) { glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED); @@ -121,15 +151,14 @@ bool HdCyclesDisplayDriver::update_begin(const Params ¶ms, void HdCyclesDisplayDriver::update_end() { gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); - graphics_interop_deactivate(); + gl_context_disable(); } void HdCyclesDisplayDriver::flush() { - graphics_interop_activate(); + gl_context_enable(); if (gl_upload_sync_) { glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED); @@ -139,7 +168,7 @@ void HdCyclesDisplayDriver::flush() glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED); } - graphics_interop_deactivate(); + gl_context_disable(); } half4 *HdCyclesDisplayDriver::map_texture_buffer() @@ -179,25 +208,12 @@ DisplayDriver::GraphicsInterop HdCyclesDisplayDriver::graphics_interop_get() void HdCyclesDisplayDriver::graphics_interop_activate() { - mutex_.lock(); - -#ifdef _WIN32 - // Do not change context if this is called in the main thread - if (wglGetCurrentContext() == nullptr) { - TF_VERIFY(wglMakeCurrent((HDC)hdc_, (HGLRC)gl_context_)); - } -#endif + gl_context_enable(); } void HdCyclesDisplayDriver::graphics_interop_deactivate() { -#ifdef _WIN32 - if (wglGetCurrentContext() == gl_context_) { - TF_VERIFY(wglMakeCurrent(nullptr, nullptr)); - } -#endif - - mutex_.unlock(); + gl_context_disable(); } void HdCyclesDisplayDriver::clear() @@ -214,7 +230,11 @@ void HdCyclesDisplayDriver::draw(const Params ¶ms) return; } - init(); + if (!renderBuffer->IsResourceUsed()) { + return; + } + + gl_context_create(); // Cycles 'DisplayDriver' only supports 'half4' format TF_VERIFY(renderBuffer->GetFormat() == HdFormatFloat16Vec4); @@ -255,7 +275,6 @@ void HdCyclesDisplayDriver::draw(const Params ¶ms) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - glFlush(); need_update_ = false; diff --git a/intern/cycles/hydra/display_driver.h b/intern/cycles/hydra/display_driver.h index 20086830e6a..2a05397c325 100644 --- a/intern/cycles/hydra/display_driver.h +++ b/intern/cycles/hydra/display_driver.h @@ -19,9 +19,6 @@ class HdCyclesDisplayDriver final : public CCL_NS::DisplayDriver { ~HdCyclesDisplayDriver(); private: - void init(); - void deinit(); - void next_tile_begin() override; bool update_begin(const Params ¶ms, int texture_width, int texture_height) override; @@ -41,6 +38,11 @@ class HdCyclesDisplayDriver final : public CCL_NS::DisplayDriver { void draw(const Params ¶ms) override; + void gl_context_create(); + bool gl_context_enable(); + void gl_context_disable(); + void gl_context_dispose(); + HdCyclesSession *const _renderParam; Hgi *const _hgi; @@ -48,7 +50,6 @@ class HdCyclesDisplayDriver final : public CCL_NS::DisplayDriver { void *hdc_ = nullptr; void *gl_context_ = nullptr; #endif - CCL_NS::thread_mutex mutex_; PXR_NS::HgiTextureHandle texture_; diff --git a/intern/cycles/hydra/output_driver.cpp b/intern/cycles/hydra/output_driver.cpp index c5f64ac1c18..f4ea853f243 100644 --- a/intern/cycles/hydra/output_driver.cpp +++ b/intern/cycles/hydra/output_driver.cpp @@ -30,11 +30,11 @@ bool HdCyclesOutputDriver::update_render_tile(const Tile &tile) std::vector<float> pixels; for (const HdRenderPassAovBinding &aovBinding : _renderParam->GetAovBindings()) { - if (aovBinding == _renderParam->GetDisplayAovBinding()) { - continue; // Display AOV binding is already updated by Cycles display driver - } - if (const auto renderBuffer = static_cast<HdCyclesRenderBuffer *>(aovBinding.renderBuffer)) { + if (aovBinding == _renderParam->GetDisplayAovBinding() && renderBuffer->IsResourceUsed()) { + continue; // Display AOV binding is already updated by Cycles display driver + } + const HdFormat format = renderBuffer->GetFormat(); if (format == HdFormatInvalid) { continue; // Skip invalid AOV bindings diff --git a/intern/cycles/hydra/render_buffer.cpp b/intern/cycles/hydra/render_buffer.cpp index 4867def0624..4d8b21d1e61 100644 --- a/intern/cycles/hydra/render_buffer.cpp +++ b/intern/cycles/hydra/render_buffer.cpp @@ -35,7 +35,7 @@ bool HdCyclesRenderBuffer::Allocate(const GfVec3i &dimensions, HdFormat format, return false; } - const size_t oldSize = _data.size(); + const size_t oldSize = _dataSize; const size_t newSize = dimensions[0] * dimensions[1] * HdDataSizeOfFormat(format); if (oldSize == newSize) { return true; @@ -49,8 +49,8 @@ bool HdCyclesRenderBuffer::Allocate(const GfVec3i &dimensions, HdFormat format, _width = dimensions[0]; _height = dimensions[1]; _format = format; - - _data.resize(newSize); + _dataSize = newSize; + _resourceUsed = false; return true; } @@ -63,6 +63,7 @@ void HdCyclesRenderBuffer::_Deallocate() _data.clear(); _data.shrink_to_fit(); + _dataSize = 0; _resource = VtValue(); } @@ -74,6 +75,10 @@ void *HdCyclesRenderBuffer::Map() return nullptr; } + if (_data.size() != _dataSize) { + _data.resize(_dataSize); + } + ++_mapped; return _data.data(); @@ -103,10 +108,17 @@ void HdCyclesRenderBuffer::SetConverged(bool converged) _converged = converged; } +bool HdCyclesRenderBuffer::IsResourceUsed() const +{ + return _resourceUsed; +} + VtValue HdCyclesRenderBuffer::GetResource(bool multiSampled) const { TF_UNUSED(multiSampled); + _resourceUsed = true; + return _resource; } diff --git a/intern/cycles/hydra/render_buffer.h b/intern/cycles/hydra/render_buffer.h index 8eb874f0068..90629d4aee0 100644 --- a/intern/cycles/hydra/render_buffer.h +++ b/intern/cycles/hydra/render_buffer.h @@ -58,6 +58,8 @@ class HdCyclesRenderBuffer final : public PXR_NS::HdRenderBuffer { void SetConverged(bool converged); + bool IsResourceUsed() const; + PXR_NS::VtValue GetResource(bool multiSampled = false) const override; void SetResource(const PXR_NS::VtValue &resource); @@ -74,9 +76,11 @@ class HdCyclesRenderBuffer final : public PXR_NS::HdRenderBuffer { unsigned int _width = 0u; unsigned int _height = 0u; PXR_NS::HdFormat _format = PXR_NS::HdFormatInvalid; + size_t _dataSize = 0; std::vector<uint8_t> _data; PXR_NS::VtValue _resource; + mutable std::atomic_bool _resourceUsed = false; std::atomic_int _mapped = 0; std::atomic_bool _converged = false; diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index 8306460d607..ede81705ae8 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -65,6 +65,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device, integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE), integrator_shader_raytrace_sort_counter_( device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE), + integrator_shader_mnee_sort_counter_( + device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE), integrator_shader_sort_prefix_sum_( device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE), integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE), @@ -188,6 +190,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting() integrator_shader_raytrace_sort_counter_.alloc(max_shaders); integrator_shader_raytrace_sort_counter_.zero_to_device(); + integrator_shader_mnee_sort_counter_.alloc(max_shaders); + integrator_shader_mnee_sort_counter_.zero_to_device(); + integrator_shader_sort_prefix_sum_.alloc(max_shaders); integrator_shader_sort_prefix_sum_.zero_to_device(); @@ -195,6 +200,8 @@ void PathTraceWorkGPU::alloc_integrator_sorting() (int *)integrator_shader_sort_counter_.device_pointer; integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = + (int *)integrator_shader_mnee_sort_counter_.device_pointer; } } @@ -327,6 +334,7 @@ void PathTraceWorkGPU::enqueue_reset() queue_->zero_to_device(integrator_queue_counter_); queue_->zero_to_device(integrator_shader_sort_counter_); queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + queue_->zero_to_device(integrator_shader_mnee_sort_counter_); /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the * counter on the host side because `zero_to_device()` is not doing it. */ @@ -450,6 +458,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { /* Shading kernels with integrator state and render buffer. */ DeviceKernelArguments args(&d_path_index, &buffers_->buffer.device_pointer, &work_size); @@ -1080,13 +1089,15 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits() bool PathTraceWorkGPU::kernel_uses_sorting(DeviceKernel kernel) { return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE); + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); } bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel) { return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE || kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); } @@ -1094,7 +1105,8 @@ bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel) { return (device_scene_->data.kernel_features & KERNEL_FEATURE_AO) && (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE || - kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE); + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); } bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel) diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h index 90f8b8a4509..4c10a221a30 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.h +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -133,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork { /* Shader sorting. */ device_vector<int> integrator_shader_sort_counter_; device_vector<int> integrator_shader_raytrace_sort_counter_; + device_vector<int> integrator_shader_mnee_sort_counter_; device_vector<int> integrator_shader_sort_prefix_sum_; /* Path split. */ device_vector<int> integrator_next_main_path_index_; diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h index 3b714a3e580..7809ec5f4a7 100644 --- a/intern/cycles/kernel/device/cpu/image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -817,6 +817,16 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, } case IMAGE_DATA_TYPE_NANOVDB_FLOAT3: return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_NANOVDB_FPN: { + const float f = NanoVDBInterpolator<nanovdb::FpN, float>::interp_3d( + info, P.x, P.y, P.z, interp); + return make_float4(f, f, f, 1.0f); + } + case IMAGE_DATA_TYPE_NANOVDB_FP16: { + const float f = NanoVDBInterpolator<nanovdb::Fp16, float>::interp_3d( + info, P.x, P.y, P.z, interp); + return make_float4(f, f, f, 1.0f); + } #endif default: assert(0); diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index c5bc7d88e02..29d851ae478 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -125,7 +125,8 @@ kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, fl #ifdef WITH_NANOVDB template<typename T, typename S> -ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z) +ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb( + S &s, float x, float y, float z) { float px = floorf(x); float py = floorf(y); @@ -157,7 +158,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl } template<typename T> -ccl_device_noinline T kernel_tex_image_interp_nanovdb( +ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb( ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -238,6 +239,14 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, info, x, y, z, interpolation); return make_float4(f[0], f[1], f[2], 1.0f); } + if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FPN) { + float f = kernel_tex_image_interp_nanovdb<nanovdb::FpN>(info, x, y, z, interpolation); + return make_float4(f, f, f, 1.0f); + } + if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + float f = kernel_tex_image_interp_nanovdb<nanovdb::Fp16>(info, x, y, z, interpolation); + return make_float4(f, f, f, 1.0f); + } #endif if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 39f0d3c9cd0..b9a44ccad02 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -245,7 +245,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel_postfix -#ifdef __KERNEL_METAL__ +#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) constant int __dummy_constant [[function_constant(0)]]; #endif @@ -260,7 +260,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; -#ifdef __KERNEL_METAL__ +#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) KernelGlobals kg = NULL; /* Workaround Ambient Occlusion and Bevel nodes not working with Metal. * Dummy offset should not affect result, but somehow fixes bug! */ @@ -274,6 +274,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_postfix ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + ccl_gpu_kernel_signature(integrator_shade_surface_mnee, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + ccl_gpu_kernel_call(integrator_shade_surface_mnee(NULL, state, render_buffer)); + } +} +ccl_gpu_kernel_postfix + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_signature(integrator_shade_volume, ccl_global const int *path_index_array, ccl_global float *render_buffer, @@ -651,8 +666,9 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb const int x = render_pixel_index % width; \ const int y = render_pixel_index / width; \ \ - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ - y * stride * kfilm_convert.pass_stride; \ + const uint64_t buffer_pixel_index = x + y * stride; \ + ccl_global const float *buffer = render_buffer + offset + \ + buffer_pixel_index * kfilm_convert.pass_stride; \ \ ccl_global float *pixel = pixels + \ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ @@ -681,8 +697,9 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb const int x = render_pixel_index % width; \ const int y = render_pixel_index / width; \ \ - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ - y * stride * kfilm_convert.pass_stride; \ + const uint64_t buffer_pixel_index = x + y * stride; \ + ccl_global const float *buffer = render_buffer + offset + \ + buffer_pixel_index * kfilm_convert.pass_stride; \ \ float pixel[4]; \ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu index e2c5d2ff024..3bd57bc0f1a 100644 --- a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu +++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu @@ -15,3 +15,11 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytr global_index; integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer); } + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_mnee() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_shade_surface_mnee(nullptr, path_index, __params.render_buffer); +} diff --git a/intern/cycles/kernel/film/adaptive_sampling.h b/intern/cycles/kernel/film/adaptive_sampling.h index ad9b3b08ac5..16867c39d99 100644 --- a/intern/cycles/kernel/film/adaptive_sampling.h +++ b/intern/cycles/kernel/film/adaptive_sampling.h @@ -91,13 +91,13 @@ ccl_device void kernel_adaptive_sampling_filter_x(KernelGlobals kg, bool prev = false; for (int x = start_x; x < start_x + width; ++x) { int index = offset + x + y * stride; - ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride; const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; if (buffer[aux_w_offset] == 0.0f) { if (x > start_x && !prev) { index = index - 1; - buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride; buffer[aux_w_offset] = 0.0f; } prev = true; @@ -124,13 +124,13 @@ ccl_device void kernel_adaptive_sampling_filter_y(KernelGlobals kg, bool prev = false; for (int y = start_y; y < start_y + height; ++y) { int index = offset + x + y * stride; - ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + ccl_global float *buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride; const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3; if (buffer[aux_w_offset] == 0.0f) { if (y > start_y && !prev) { index = index - stride; - buffer = render_buffer + index * kernel_data.film.pass_stride; + buffer = render_buffer + (uint64_t)index * kernel_data.film.pass_stride; buffer[aux_w_offset] = 0.0f; } prev = true; diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h index 3772db845a8..0db4241b6e3 100644 --- a/intern/cycles/kernel/integrator/init_from_bake.h +++ b/intern/cycles/kernel/integrator/init_from_bake.h @@ -102,7 +102,7 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, /* Setup render buffers. */ const int index = INTEGRATOR_STATE(state, path, render_pixel_index); const int pass_stride = kernel_data.film.pass_stride; - ccl_global float *buffer = render_buffer + index * pass_stride; + ccl_global float *buffer = render_buffer + (uint64_t)index * pass_stride; ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive; ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential; @@ -243,9 +243,12 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, /* Setup next kernel to execute. */ const bool use_caustics = kernel_data.integrator.use_caustics && (object_flag & SD_OBJECT_CAUSTICS); - const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE) || use_caustics; + const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE); - if (use_raytrace_kernel) { + if (use_caustics) { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index); + } + else if (use_raytrace_kernel) { INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index); } else { diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h index b8ce625c11b..2dfac44b414 100644 --- a/intern/cycles/kernel/integrator/intersect_closest.h +++ b/intern/cycles/kernel/integrator/intersect_closest.h @@ -125,9 +125,12 @@ ccl_device_forceinline void integrator_split_shadow_catcher( const int flags = kernel_tex_fetch(__shaders, shader).flags; const bool use_caustics = kernel_data.integrator.use_caustics && (object_flags & SD_OBJECT_CAUSTICS); - const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics; + const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); - if (use_raytrace_kernel) { + if (use_caustics) { + INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + } + else if (use_raytrace_kernel) { INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } else { @@ -150,9 +153,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche const int object_flags = intersection_get_object_flags(kg, &isect); const bool use_caustics = kernel_data.integrator.use_caustics && (object_flags & SD_OBJECT_CAUSTICS); - const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics; + const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); - if (use_raytrace_kernel) { + if (use_caustics) { + INTEGRATOR_PATH_NEXT_SORTED( + current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + } + else if (use_raytrace_kernel) { INTEGRATOR_PATH_NEXT_SORTED( current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } @@ -222,8 +229,12 @@ ccl_device_forceinline void integrator_intersect_next_kernel( const int object_flags = intersection_get_object_flags(kg, isect); const bool use_caustics = kernel_data.integrator.use_caustics && (object_flags & SD_OBJECT_CAUSTICS); - const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics; - if (use_raytrace_kernel) { + const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); + if (use_caustics) { + INTEGRATOR_PATH_NEXT_SORTED( + current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + } + else if (use_raytrace_kernel) { INTEGRATOR_PATH_NEXT_SORTED( current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } @@ -272,9 +283,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume( const int object_flags = intersection_get_object_flags(kg, isect); const bool use_caustics = kernel_data.integrator.use_caustics && (object_flags & SD_OBJECT_CAUSTICS); - const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE) || use_caustics; + const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); - if (use_raytrace_kernel) { + if (use_caustics) { + INTEGRATOR_PATH_NEXT_SORTED( + current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + } + else if (use_raytrace_kernel) { INTEGRATOR_PATH_NEXT_SORTED( current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } diff --git a/intern/cycles/kernel/integrator/megakernel.h b/intern/cycles/kernel/integrator/megakernel.h index a0c15794470..17ae13ad23f 100644 --- a/intern/cycles/kernel/integrator/megakernel.h +++ b/intern/cycles/kernel/integrator/megakernel.h @@ -77,6 +77,9 @@ ccl_device void integrator_megakernel(KernelGlobals kg, case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: integrator_shade_surface_raytrace(kg, state, render_buffer); break; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: + integrator_shade_surface_mnee(kg, state, render_buffer); + break; case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: integrator_shade_light(kg, state, render_buffer); break; diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h index 2f7b711e28c..ad83f82d091 100644 --- a/intern/cycles/kernel/integrator/mnee.h +++ b/intern/cycles/kernel/integrator/mnee.h @@ -807,6 +807,15 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg, * and keep pdf in vertex area measure */ mnee_update_light_sample(kg, vertices[vertex_count - 1].p, ls); + /* Save state path bounce info in case a light path node is used in the refractive interface or + * light shader graph. */ + const int transmission_bounce = INTEGRATOR_STATE(state, path, transmission_bounce); + const int diffuse_bounce = INTEGRATOR_STATE(state, path, diffuse_bounce); + const int bounce = INTEGRATOR_STATE(state, path, bounce); + + /* Set diffuse bounce info . */ + INTEGRATOR_STATE_WRITE(state, path, diffuse_bounce) = diffuse_bounce + 1; + /* Evaluate light sample * in case the light has a node-based shader: * 1. sd_mnee will be used to store light data, which is why we need to do @@ -814,6 +823,12 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg, * interface data at the end of the call for the shadow ray setup to work. * 2. ls needs to contain the last interface data for the light shader to * evaluate properly */ + + /* Set bounce info in case a light path node is used in the light shader graph. */ + INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce + vertex_count - + 1; + INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + vertex_count; + float3 light_eval = light_sample_shader_eval(kg, state, sd_mnee, ls, sd->time); bsdf_eval_mul3(throughput, light_eval / ls->pdf); @@ -885,6 +900,11 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg, false, LAMP_NONE); + /* Set bounce info in case a light path node is used in the refractive interface + * shader graph. */ + INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce + vi; + INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + 1 + vi; + /* Evaluate shader nodes at solution vi. */ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>( kg, state, sd_mnee, NULL, PATH_RAY_DIFFUSE, true); @@ -901,6 +921,11 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg, bsdf_eval_mul3(throughput, bsdf_contribution); } + /* Restore original state path bounce info. */ + INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce; + INTEGRATOR_STATE_WRITE(state, path, diffuse_bounce) = diffuse_bounce; + INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce; + return true; } @@ -1029,18 +1054,17 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg, return 0; /* Check whether the transmission depth limit is reached before continuing. */ - if (INTEGRATOR_STATE(state, path, transmission_bounce) + vertex_count >= + if ((INTEGRATOR_STATE(state, path, transmission_bounce) + vertex_count - 1) >= kernel_data.integrator.max_transmission_bounce) return 0; /* Check whether the diffuse depth limit is reached before continuing. */ - if (INTEGRATOR_STATE(state, path, diffuse_bounce) + 1 >= + if ((INTEGRATOR_STATE(state, path, diffuse_bounce) + 1) >= kernel_data.integrator.max_diffuse_bounce) return 0; /* Check whether the overall depth limit is reached before continuing. */ - if (INTEGRATOR_STATE(state, path, bounce) + 1 + vertex_count >= - kernel_data.integrator.max_bounce) + if ((INTEGRATOR_STATE(state, path, bounce) + vertex_count) >= kernel_data.integrator.max_bounce) return 0; /* Mark the manifold walk valid to turn off mollification regardless of how successful the walk diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h index 62b3ce1c15c..72ecf67e8a0 100644 --- a/intern/cycles/kernel/integrator/shade_background.h +++ b/intern/cycles/kernel/integrator/shade_background.h @@ -48,7 +48,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg, PROFILING_SHADER(emission_sd->object, emission_sd->shader); PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL); - shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( + shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND>( kg, state, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION); L = shader_background_eval(emission_sd); diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h index 859c314b088..ce1398859b7 100644 --- a/intern/cycles/kernel/integrator/shade_surface.h +++ b/intern/cycles/kernel/integrator/shade_surface.h @@ -137,7 +137,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, # ifdef __MNEE__ int mnee_vertex_count = 0; - IF_KERNEL_NODES_FEATURE(RAYTRACE) + IF_KERNEL_FEATURE(MNEE) { if (ls.lamp != LAMP_NONE) { /* Is this a caustic light? */ @@ -253,13 +253,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, # ifdef __MNEE__ if (mnee_vertex_count > 0) { INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = - INTEGRATOR_STATE(state, path, transmission_bounce) + mnee_vertex_count; + INTEGRATOR_STATE(state, path, transmission_bounce) + mnee_vertex_count - 1; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_bounce) = INTEGRATOR_STATE(state, path, diffuse_bounce) + 1; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, - bounce) = INTEGRATOR_STATE(state, path, bounce) + 1 + mnee_vertex_count; + bounce) = INTEGRATOR_STATE(state, path, bounce) + mnee_vertex_count; } else # endif @@ -631,4 +631,12 @@ ccl_device_forceinline void integrator_shade_surface_raytrace( kg, state, render_buffer); } +ccl_device_forceinline void integrator_shade_surface_mnee( + KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer) +{ + integrator_shade_surface<(KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE) | + KERNEL_FEATURE_MNEE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE>(kg, state, render_buffer); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/subsurface.h b/intern/cycles/kernel/integrator/subsurface.h index 2391cc2356d..b449f807290 100644 --- a/intern/cycles/kernel/integrator/subsurface.h +++ b/intern/cycles/kernel/integrator/subsurface.h @@ -174,9 +174,14 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat const int object_flags = intersection_get_object_flags(kg, &ss_isect.hits[0]); const bool use_caustics = kernel_data.integrator.use_caustics && (object_flags & SD_OBJECT_CAUSTICS); - const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE) || use_caustics; + const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE); - if (use_raytrace_kernel) { + if (use_caustics) { + INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, + shader); + } + else if (use_raytrace_kernel) { INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index dbbce655ef7..39fcee1ac0d 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -17,6 +17,7 @@ #include "kernel/types.h" #include "kernel/closure/alloc.h" #include "kernel/closure/bsdf_diffuse_ramp.h" +#include "kernel/closure/bsdf_util.h" // clang-format on CCL_NAMESPACE_BEGIN @@ -30,6 +31,8 @@ class DiffuseRampClosure : public CBSDFClosure { void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + DiffuseRampBsdf *bsdf = (DiffuseRampBsdf *)bsdf_alloc_osl( sd, sizeof(DiffuseRampBsdf), weight, ¶ms); diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index fd602c8847e..972ed7e4a6d 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -17,6 +17,7 @@ #include "kernel/types.h" #include "kernel/closure/alloc.h" #include "kernel/closure/bsdf_phong_ramp.h" +#include "kernel/closure/bsdf_util.h" // clang-format on CCL_NAMESPACE_BEGIN @@ -30,6 +31,8 @@ class PhongRampClosure : public CBSDFClosure { void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + PhongRampBsdf *bsdf = (PhongRampBsdf *)bsdf_alloc_osl( sd, sizeof(PhongRampBsdf), weight, ¶ms); diff --git a/intern/cycles/kernel/osl/bssrdf.cpp b/intern/cycles/kernel/osl/bssrdf.cpp index 29d2d93ce34..4b282fddad3 100644 --- a/intern/cycles/kernel/osl/bssrdf.cpp +++ b/intern/cycles/kernel/osl/bssrdf.cpp @@ -44,6 +44,8 @@ class CBSSRDFClosure : public CClosurePrimitive { void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + if (method == u_burley) { alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID); } diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp index 06b91ab9bb7..7c6b48154e4 100644 --- a/intern/cycles/kernel/osl/closures.cpp +++ b/intern/cycles/kernel/osl/closures.cpp @@ -180,6 +180,8 @@ class PrincipledSheenClosure : public CBSDFClosure { void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { if (!skip(sd, path_flag, LABEL_DIFFUSE)) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)bsdf_alloc_osl( sd, sizeof(PrincipledSheenBsdf), weight, ¶ms); sd->flag |= (bsdf) ? bsdf_principled_sheen_setup(sd, bsdf) : 0; @@ -223,6 +225,8 @@ class PrincipledHairClosure : public CBSDFClosure { void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { if (!skip(sd, path_flag, LABEL_GLOSSY)) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -282,6 +286,7 @@ class PrincipledClearcoatClosure : public CBSDFClosure { void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -503,6 +508,8 @@ class MicrofacetClosure : public CBSDFClosure { return; } + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl( sd, sizeof(MicrofacetBsdf), weight, ¶ms); @@ -601,6 +608,8 @@ class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure { public: void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -630,6 +639,8 @@ class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure { public: void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -695,6 +706,8 @@ class MicrofacetMultiGGXClosure : public MicrofacetMultiClosure { public: void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -723,6 +736,8 @@ class MicrofacetMultiGGXAnisoClosure : public MicrofacetMultiClosure { public: void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -755,6 +770,8 @@ class MicrofacetMultiGGXGlassClosure : public MicrofacetMultiClosure { void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -819,6 +836,8 @@ class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure { public: void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -849,6 +868,8 @@ class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosu public: void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; @@ -883,6 +904,8 @@ class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosu void setup(ShaderData *sd, uint32_t path_flag, float3 weight) { + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); if (!bsdf) { return; diff --git a/intern/cycles/kernel/osl/closures.h b/intern/cycles/kernel/osl/closures.h index 6225644a8e6..e10a3d88a04 100644 --- a/intern/cycles/kernel/osl/closures.h +++ b/intern/cycles/kernel/osl/closures.h @@ -114,6 +114,7 @@ class CBSDFClosure : public CClosurePrimitive { void setup(ShaderData *sd, uint32_t path_flag, float3 weight) \ { \ if (!skip(sd, path_flag, TYPE)) { \ + params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); \ structname *bsdf = (structname *)bsdf_alloc_osl(sd, sizeof(structname), weight, ¶ms); \ sd->flag |= (bsdf) ? bsdf_##lower##_setup(bsdf) : 0; \ } \ diff --git a/intern/cycles/kernel/svm/blackbody.h b/intern/cycles/kernel/svm/blackbody.h index af59c2fe747..774fa16b384 100644 --- a/intern/cycles/kernel/svm/blackbody.h +++ b/intern/cycles/kernel/svm/blackbody.h @@ -24,6 +24,7 @@ ccl_device_noinline void svm_node_blackbody(KernelGlobals kg, float temperature = stack_load_float(stack, temperature_offset); float3 color_rgb = rec709_to_rgb(kg, svm_math_blackbody_color_rec709(temperature)); + color_rgb = max(color_rgb, zero_float3()); stack_store_float3(stack, col_offset, color_rgb); } diff --git a/intern/cycles/kernel/svm/displace.h b/intern/cycles/kernel/svm/displace.h index 56fb5c3f9b0..128023263fd 100644 --- a/intern/cycles/kernel/svm/displace.h +++ b/intern/cycles/kernel/svm/displace.h @@ -188,6 +188,7 @@ ccl_device_noinline int svm_node_vector_displacement( else { stack_store_float3(stack, displacement_offset, zero_float3()); + (void)data_node; } return offset; diff --git a/intern/cycles/kernel/svm/math_util.h b/intern/cycles/kernel/svm/math_util.h index 9f2d9561e26..89bd4a501a7 100644 --- a/intern/cycles/kernel/svm/math_util.h +++ b/intern/cycles/kernel/svm/math_util.h @@ -192,28 +192,26 @@ ccl_device float svm_math(NodeMathType type, float a, float b, float c) ccl_device float3 svm_math_blackbody_color_rec709(float t) { /* Calculate color in range 800..12000 using an approximation - * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B - * Max absolute error for RGB is (0.00095, 0.00077, 0.00057), - * which is enough to get the same 8 bit/channel color. - */ + * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B. + * + * The result of this can be negative to support gamut wider than + * than rec.709, just needs to be clamped. */ if (t >= 12000.0f) { - return make_float3(0.826270103f, 0.994478524f, 1.56626022f); + return make_float3(0.8262954810464208f, 0.9945080501520986f, 1.566307710274283f); } - else if (t < 965.0f) { - /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ - return make_float3(4.70366907f, 0.0f, 0.0f); + else if (t < 800.0f) { + /* Arbitrary lower limit where light is very dim, matching OSL. */ + return make_float3(5.413294490189271f, -0.20319390035873933f, -0.0822535242887164f); } - /* Manually align for readability. */ - /* clang-format off */ - int i = (t >= 6365.0f) ? 5 : - (t >= 3315.0f) ? 4 : - (t >= 1902.0f) ? 3 : - (t >= 1449.0f) ? 2 : - (t >= 1167.0f) ? 1 : + int i = (t >= 6365.0f) ? 6 : + (t >= 3315.0f) ? 5 : + (t >= 1902.0f) ? 4 : + (t >= 1449.0f) ? 3 : + (t >= 1167.0f) ? 2 : + (t >= 965.0f) ? 1 : 0; - /* clang-format on */ ccl_constant float *r = blackbody_table_r[i]; ccl_constant float *g = blackbody_table_g[i]; diff --git a/intern/cycles/kernel/tables.h b/intern/cycles/kernel/tables.h index f826cc5c5ef..c1fdbba3fa7 100644 --- a/intern/cycles/kernel/tables.h +++ b/intern/cycles/kernel/tables.h @@ -4,30 +4,33 @@ /* clang-format off */ ccl_inline_constant float blackbody_table_r[][3] = { - {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f}, - {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f}, - {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f}, - {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f}, - {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f}, - {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f} + {1.61919106e+03f, -2.05010916e-03f, 5.02995757e+00f}, + {2.48845471e+03f, -1.11330907e-03f, 3.22621544e+00f}, + {3.34143193e+03f, -4.86551192e-04f, 1.76486769e+00f}, + {4.09461742e+03f, -1.27446582e-04f, 7.25731635e-01f}, + {4.67028036e+03f, 2.91258199e-05f, 1.26703442e-01f}, + {4.59509185e+03f, 2.87495649e-05f, 1.50345020e-01f}, + {3.78717450e+03f, 9.35907826e-06f, 3.99075871e-01f} }; ccl_inline_constant float blackbody_table_g[][3] = { - {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f}, - {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f}, - {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f}, - {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f}, - {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f}, - {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f} + {-4.88999748e+02f, 6.04330754e-04f, -7.55807526e-02f}, + {-7.55994277e+02f, 3.16730098e-04f, 4.78306139e-01f}, + {-1.02363977e+03f, 1.20223470e-04f, 9.36662319e-01f}, + {-1.26571316e+03f, 4.87340896e-06f, 1.27054498e+00f}, + {-1.42529332e+03f, -4.01150431e-05f, 1.43972784e+00f}, + {-1.17554822e+03f, -2.16378048e-05f, 1.30408023e+00f}, + {-5.00799571e+02f, -4.59832026e-06f, 1.09098763e+00f} }; ccl_inline_constant float blackbody_table_b[][4] = { - {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */ - {0.0f, 0.0f, 0.0f, 0.0f}, - {0.0f, 0.0f, 0.0f, 0.0f}, - {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f}, - {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f}, - {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f} + {5.96945309e-11f, -4.85742887e-08f, -9.70622247e-05f, -4.07936148e-03f}, + {2.40430366e-11f, 5.55021075e-08f, -1.98503712e-04f, 2.89312858e-02f}, + {-1.40949732e-11f, 1.89878968e-07f, -3.56632824e-04f, 9.10767778e-02f}, + {-3.61460868e-11f, 2.84822009e-07f, -4.93211319e-04f, 1.56723440e-01f}, + {-1.97075738e-11f, 1.75359352e-07f, -2.50542825e-04f, -2.22783266e-02f}, + {-1.61997957e-13f, -1.64216008e-08f, 3.86216271e-04f, -7.38077418e-01f}, + {6.72650283e-13f, -2.73078809e-08f, 4.24098264e-04f, -7.52335691e-01f} }; ccl_inline_constant float cie_colour_match[][3] = { diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 297e88c718c..a76effdd952 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1572,6 +1572,7 @@ typedef enum DeviceKernel : int { DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, + DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL, @@ -1689,6 +1690,9 @@ enum KernelFeatureFlag : uint32_t { KERNEL_FEATURE_AO_PASS = (1U << 25U), KERNEL_FEATURE_AO_ADDITIVE = (1U << 26U), KERNEL_FEATURE_AO = (KERNEL_FEATURE_AO_PASS | KERNEL_FEATURE_AO_ADDITIVE), + + /* MNEE. */ + KERNEL_FEATURE_MNEE = (1U << 27U), }; /* Shader node feature mask, to specialize shader evaluation for kernels. */ @@ -1696,6 +1700,8 @@ enum KernelFeatureFlag : uint32_t { #define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA | \ KERNEL_FEATURE_NODE_LIGHT_PATH) +#define KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND \ + (KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT | KERNEL_FEATURE_NODE_AOV) #define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \ (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \ KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \ @@ -1714,9 +1720,12 @@ enum KernelFeatureFlag : uint32_t { * are different depending on the main, shadow or null path. For GPU we don't have * C++17 everywhere so can't use it. */ #ifdef __KERNEL_CPU__ +# define IF_KERNEL_FEATURE(feature) \ + if constexpr ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U) # define IF_KERNEL_NODES_FEATURE(feature) \ if constexpr ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U) #else +# define IF_KERNEL_FEATURE(feature) if ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U) # define IF_KERNEL_NODES_FEATURE(feature) \ if ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U) #endif diff --git a/intern/cycles/scene/attribute.cpp b/intern/cycles/scene/attribute.cpp index df01189a54b..d6b4c0240f6 100644 --- a/intern/cycles/scene/attribute.cpp +++ b/intern/cycles/scene/attribute.cpp @@ -661,6 +661,26 @@ Attribute *AttributeSet::find(AttributeStandard std) const return NULL; } +Attribute *AttributeSet::find_matching(const Attribute &other) +{ + for (Attribute &attr : attributes) { + if (attr.name != other.name) { + continue; + } + if (attr.std != other.std) { + continue; + } + if (attr.type != other.type) { + continue; + } + if (attr.element != other.element) { + continue; + } + return &attr; + } + return nullptr; +} + void AttributeSet::remove(AttributeStandard std) { Attribute *attr = find(std); @@ -729,32 +749,24 @@ void AttributeSet::clear(bool preserve_voxel_data) void AttributeSet::update(AttributeSet &&new_attributes) { - /* add or update old_attributes based on the new_attributes */ - foreach (Attribute &attr, new_attributes.attributes) { - Attribute *nattr = add(attr.name, attr.type, attr.element); - nattr->std = attr.std; - nattr->set_data_from(std::move(attr)); - } - - /* remove any attributes not on new_attributes */ + /* Remove any attributes not on new_attributes. */ list<Attribute>::iterator it; for (it = attributes.begin(); it != attributes.end();) { - if (it->std != ATTR_STD_NONE) { - if (new_attributes.find(it->std) == nullptr) { - remove(it++); - continue; - } - } - else if (it->name != "") { - if (new_attributes.find(it->name) == nullptr) { - remove(it++); - continue; - } + const Attribute &old_attr = *it; + if (new_attributes.find_matching(old_attr) == nullptr) { + remove(it++); + continue; } - it++; } + /* Add or update old_attributes based on the new_attributes. */ + foreach (Attribute &attr, new_attributes.attributes) { + Attribute *nattr = add(attr.name, attr.type, attr.element); + nattr->std = attr.std; + nattr->set_data_from(std::move(attr)); + } + /* If all attributes were replaced, transform is no longer applied. */ geometry->transform_applied = false; } diff --git a/intern/cycles/scene/attribute.h b/intern/cycles/scene/attribute.h index fd13b8ff6de..7f8cbf32049 100644 --- a/intern/cycles/scene/attribute.h +++ b/intern/cycles/scene/attribute.h @@ -194,6 +194,7 @@ class AttributeSet { void remove(AttributeStandard std); Attribute *find(AttributeRequest &req); + Attribute *find_matching(const Attribute &other); void remove(Attribute *attribute); diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp index c61ad1f1d71..1b44162351a 100644 --- a/intern/cycles/scene/image.cpp +++ b/intern/cycles/scene/image.cpp @@ -64,6 +64,10 @@ const char *name_from_type(ImageDataType type) return "nanovdb_float"; case IMAGE_DATA_TYPE_NANOVDB_FLOAT3: return "nanovdb_float3"; + case IMAGE_DATA_TYPE_NANOVDB_FPN: + return "nanovdb_fpn"; + case IMAGE_DATA_TYPE_NANOVDB_FP16: + return "nanovdb_fp16"; case IMAGE_DATA_NUM_TYPES: assert(!"System enumerator type, should never be used"); return ""; @@ -268,17 +272,12 @@ void ImageMetaData::detect_colorspace() compress_as_srgb = true; } else { - /* Always compress non-raw 8bit images as scene linear + sRGB, as a - * heuristic to keep memory usage the same without too much data loss - * due to quantization in common cases. */ - compress_as_srgb = (type == IMAGE_DATA_TYPE_BYTE || type == IMAGE_DATA_TYPE_BYTE4); - /* If colorspace conversion needed, use half instead of short so we can * represent HDR values that might result from conversion. */ - if (type == IMAGE_DATA_TYPE_USHORT) { + if (type == IMAGE_DATA_TYPE_BYTE || type == IMAGE_DATA_TYPE_USHORT) { type = IMAGE_DATA_TYPE_HALF; } - else if (type == IMAGE_DATA_TYPE_USHORT4) { + else if (type == IMAGE_DATA_TYPE_BYTE4 || type == IMAGE_DATA_TYPE_USHORT4) { type = IMAGE_DATA_TYPE_HALF4; } } @@ -378,7 +377,9 @@ void ImageManager::load_image_metadata(Image *img) metadata.detect_colorspace(); assert(features.has_nanovdb || (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT || - metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3)); + metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 || + metadata.type != IMAGE_DATA_TYPE_NANOVDB_FPN || + metadata.type != IMAGE_DATA_TYPE_NANOVDB_FP16)); img->need_metadata = false; } @@ -796,7 +797,8 @@ void ImageManager::device_load_image(Device *device, Scene *scene, int slot, Pro } } #ifdef WITH_NANOVDB - else if (type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + else if (type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 || + type == IMAGE_DATA_TYPE_NANOVDB_FPN || type == IMAGE_DATA_TYPE_NANOVDB_FP16) { thread_scoped_lock device_lock(device_mutex); void *pixels = img->mem->alloc(img->metadata.byte_size, 0); diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp index 3f825afbe90..09676455308 100644 --- a/intern/cycles/scene/image_oiio.cpp +++ b/intern/cycles/scene/image_oiio.cpp @@ -94,10 +94,11 @@ bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures & /*features*/, template<TypeDesc::BASETYPE FileFormat, typename StorageType> static void oiio_load_pixels(const ImageMetaData &metadata, const unique_ptr<ImageInput> &in, + const bool associate_alpha, StorageType *pixels) { - const int width = metadata.width; - const int height = metadata.height; + const size_t width = metadata.width; + const size_t height = metadata.height; const int depth = metadata.depth; const int components = metadata.channels; @@ -105,12 +106,12 @@ static void oiio_load_pixels(const ImageMetaData &metadata, StorageType *readpixels = pixels; vector<StorageType> tmppixels; if (components > 4) { - tmppixels.resize(((size_t)width) * height * components); + tmppixels.resize(width * height * components); readpixels = &tmppixels[0]; } if (depth <= 1) { - size_t scanlinesize = ((size_t)width) * components * sizeof(StorageType); + size_t scanlinesize = width * components * sizeof(StorageType); in->read_image(FileFormat, (uchar *)readpixels + (height - 1) * scanlinesize, AutoStride, @@ -122,7 +123,7 @@ static void oiio_load_pixels(const ImageMetaData &metadata, } if (components > 4) { - size_t dimensions = ((size_t)width) * height; + size_t dimensions = width * height; for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) { pixels[i * 4 + 3] = tmppixels[i * components + 3]; pixels[i * 4 + 2] = tmppixels[i * components + 2]; @@ -137,7 +138,7 @@ static void oiio_load_pixels(const ImageMetaData &metadata, if (cmyk) { const StorageType one = util_image_cast_from_float<StorageType>(1.0f); - const size_t num_pixels = ((size_t)width) * height * depth; + const size_t num_pixels = width * height * depth; for (size_t i = num_pixels - 1, pixel = 0; pixel < num_pixels; pixel++, i--) { float c = util_image_cast_to_float(pixels[i * 4 + 0]); float m = util_image_cast_to_float(pixels[i * 4 + 1]); @@ -149,6 +150,16 @@ static void oiio_load_pixels(const ImageMetaData &metadata, pixels[i * 4 + 3] = one; } } + + if (components == 4 && associate_alpha) { + size_t dimensions = width * height; + for (size_t i = dimensions - 1, pixel = 0; pixel < dimensions; pixel++, i--) { + const StorageType alpha = pixels[i * 4 + 3]; + pixels[i * 4 + 0] = util_image_multiply_native(pixels[i * 4 + 0], alpha); + pixels[i * 4 + 1] = util_image_multiply_native(pixels[i * 4 + 1], alpha); + pixels[i * 4 + 2] = util_image_multiply_native(pixels[i * 4 + 2], alpha); + } + } } bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata, @@ -172,33 +183,41 @@ bool OIIOImageLoader::load_pixels(const ImageMetaData &metadata, ImageSpec spec = ImageSpec(); ImageSpec config = ImageSpec(); - if (!associate_alpha) { - config.attribute("oiio:UnassociatedAlpha", 1); - } + /* Load without automatic OIIO alpha conversion, we do it ourselves. OIIO + * will associate alpha in the the 8bit buffer for PNGs, which leads to too + * much precision loss when we load it as half float to do a colorspace + * transform. */ + config.attribute("oiio:UnassociatedAlpha", 1); if (!in->open(filepath.string(), spec, config)) { return false; } + const bool do_associate_alpha = associate_alpha && + spec.get_int_attribute("oiio:UnassociatedAlpha", 0); + switch (metadata.type) { case IMAGE_DATA_TYPE_BYTE: case IMAGE_DATA_TYPE_BYTE4: - oiio_load_pixels<TypeDesc::UINT8, uchar>(metadata, in, (uchar *)pixels); + oiio_load_pixels<TypeDesc::UINT8, uchar>(metadata, in, do_associate_alpha, (uchar *)pixels); break; case IMAGE_DATA_TYPE_USHORT: case IMAGE_DATA_TYPE_USHORT4: - oiio_load_pixels<TypeDesc::USHORT, uint16_t>(metadata, in, (uint16_t *)pixels); + oiio_load_pixels<TypeDesc::USHORT, uint16_t>( + metadata, in, do_associate_alpha, (uint16_t *)pixels); break; case IMAGE_DATA_TYPE_HALF: case IMAGE_DATA_TYPE_HALF4: - oiio_load_pixels<TypeDesc::HALF, half>(metadata, in, (half *)pixels); + oiio_load_pixels<TypeDesc::HALF, half>(metadata, in, do_associate_alpha, (half *)pixels); break; case IMAGE_DATA_TYPE_FLOAT: case IMAGE_DATA_TYPE_FLOAT4: - oiio_load_pixels<TypeDesc::FLOAT, float>(metadata, in, (float *)pixels); + oiio_load_pixels<TypeDesc::FLOAT, float>(metadata, in, do_associate_alpha, (float *)pixels); break; case IMAGE_DATA_TYPE_NANOVDB_FLOAT: case IMAGE_DATA_TYPE_NANOVDB_FLOAT3: + case IMAGE_DATA_TYPE_NANOVDB_FPN: + case IMAGE_DATA_TYPE_NANOVDB_FP16: case IMAGE_DATA_NUM_TYPES: break; } diff --git a/intern/cycles/scene/image_vdb.cpp b/intern/cycles/scene/image_vdb.cpp index b6f0911fa2c..2209be60a97 100644 --- a/intern/cycles/scene/image_vdb.cpp +++ b/intern/cycles/scene/image_vdb.cpp @@ -44,14 +44,30 @@ struct ToDenseOp { # ifdef WITH_NANOVDB struct ToNanoOp { nanovdb::GridHandle<> nanogrid; + int precision; template<typename GridType, typename FloatGridType, typename FloatDataType, int channels> bool operator()(const openvdb::GridBase::ConstPtr &grid) { if constexpr (!std::is_same_v<GridType, openvdb::MaskGrid>) { try { - nanogrid = nanovdb::openToNanoVDB( - FloatGridType(*openvdb::gridConstPtrCast<GridType>(grid))); + FloatGridType floatgrid(*openvdb::gridConstPtrCast<GridType>(grid)); + if constexpr (std::is_same_v<FloatGridType, openvdb::FloatGrid>) { + if (precision == 0) { + nanogrid = nanovdb::openToNanoVDB<nanovdb::HostBuffer, + typename FloatGridType::TreeType, + nanovdb::FpN>(floatgrid); + return true; + } + else if (precision == 16) { + nanogrid = nanovdb::openToNanoVDB<nanovdb::HostBuffer, + typename FloatGridType::TreeType, + nanovdb::Fp16>(floatgrid); + return true; + } + } + + nanogrid = nanovdb::openToNanoVDB(floatgrid); } catch (const std::exception &e) { VLOG(1) << "Error converting OpenVDB to NanoVDB grid: " << e.what(); @@ -98,10 +114,13 @@ bool VDBImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMet # ifdef WITH_NANOVDB if (features.has_nanovdb) { /* NanoVDB expects no inactive leaf nodes. */ - /*openvdb::FloatGrid &pruned_grid = *openvdb::gridPtrCast<openvdb::FloatGrid>(grid); +# if 0 + openvdb::FloatGrid &pruned_grid = *openvdb::gridPtrCast<openvdb::FloatGrid>(grid); openvdb::tools::pruneInactive(pruned_grid.tree()); - nanogrid = nanovdb::openToNanoVDB(pruned_grid);*/ + nanogrid = nanovdb::openToNanoVDB(pruned_grid); +# endif ToNanoOp op; + op.precision = precision; if (!openvdb::grid_type_operation(grid, op)) { return false; } @@ -124,7 +143,15 @@ bool VDBImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMet if (nanogrid) { metadata.byte_size = nanogrid.size(); if (metadata.channels == 1) { - metadata.type = IMAGE_DATA_TYPE_NANOVDB_FLOAT; + if (precision == 0) { + metadata.type = IMAGE_DATA_TYPE_NANOVDB_FPN; + } + else if (precision == 16) { + metadata.type = IMAGE_DATA_TYPE_NANOVDB_FP16; + } + else { + metadata.type = IMAGE_DATA_TYPE_NANOVDB_FLOAT; + } } else { metadata.type = IMAGE_DATA_TYPE_NANOVDB_FLOAT3; diff --git a/intern/cycles/scene/image_vdb.h b/intern/cycles/scene/image_vdb.h index a5fd51915ef..ea5f6b0b3d9 100644 --- a/intern/cycles/scene/image_vdb.h +++ b/intern/cycles/scene/image_vdb.h @@ -51,6 +51,7 @@ class VDBImageLoader : public ImageLoader { #endif #ifdef WITH_NANOVDB nanovdb::GridHandle<> nanogrid; + int precision = 0; #endif }; diff --git a/intern/cycles/scene/object.cpp b/intern/cycles/scene/object.cpp index 8015be6393b..ddd89a16640 100644 --- a/intern/cycles/scene/object.cpp +++ b/intern/cycles/scene/object.cpp @@ -327,9 +327,11 @@ float Object::compute_volume_step_size() const /* Auto detect step size. */ float3 size = one_float3(); #ifdef WITH_NANOVDB - /* Dimensions were not applied to image transform with NanOVDB (see image_vdb.cpp) */ + /* Dimensions were not applied to image transform with NanoVDB (see image_vdb.cpp) */ if (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) + metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 && + metadata.type != IMAGE_DATA_TYPE_NANOVDB_FPN && + metadata.type != IMAGE_DATA_TYPE_NANOVDB_FP16) #endif size /= make_float3(metadata.width, metadata.height, metadata.depth); diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp index b35242139ea..8b5604eba72 100644 --- a/intern/cycles/scene/scene.cpp +++ b/intern/cycles/scene/scene.cpp @@ -550,7 +550,7 @@ void Scene::update_kernel_features() dscene.data.integrator.use_caustics = false; if (has_caustics_caster && has_caustics_receiver && has_caustics_light) { dscene.data.integrator.use_caustics = true; - kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE; + kernel_features |= KERNEL_FEATURE_MNEE; } if (bake_manager->get_baking()) { @@ -597,6 +597,7 @@ static void log_kernel_features(const uint features) << "\n"; VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE) << "\n"; + VLOG(2) << "Use MNEE" << string_from_bool(features & KERNEL_FEATURE_MNEE) << "\n"; VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n"; VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n"; VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING) diff --git a/intern/cycles/scene/shader_graph.cpp b/intern/cycles/scene/shader_graph.cpp index d44e12f5fab..f25d0b7c7b9 100644 --- a/intern/cycles/scene/shader_graph.cpp +++ b/intern/cycles/scene/shader_graph.cpp @@ -888,7 +888,7 @@ void ShaderGraph::default_inputs(bool do_osl) void ShaderGraph::refine_bump_nodes() { - /* we transverse the node graph looking for bump nodes, when we find them, + /* We transverse the node graph looking for bump nodes, when we find them, * like in bump_from_displacement(), we copy the sub-graph defined from "bump" * input to the inputs "center","dx" and "dy" What is in "bump" input is moved * to "center" input. */ @@ -898,18 +898,18 @@ void ShaderGraph::refine_bump_nodes() ShaderInput *bump_input = node->input("Height"); ShaderNodeSet nodes_bump; - /* make 2 extra copies of the subgraph defined in Bump input */ + /* Make 2 extra copies of the subgraph defined in Bump input. */ ShaderNodeMap nodes_dx; ShaderNodeMap nodes_dy; - /* find dependencies for the given input */ + /* Find dependencies for the given input. */ find_dependencies(nodes_bump, bump_input); copy_nodes(nodes_bump, nodes_dx); copy_nodes(nodes_bump, nodes_dy); - /* mark nodes to indicate they are use for bump computation, so - that any texture coordinates are shifted by dx/dy when sampling */ + /* Mark nodes to indicate they are use for bump computation, so + * that any texture coordinates are shifted by dx/dy when sampling. */ foreach (ShaderNode *node, nodes_bump) node->bump = SHADER_BUMP_CENTER; foreach (NodePair &pair, nodes_dx) @@ -924,7 +924,7 @@ void ShaderGraph::refine_bump_nodes() connect(out_dx, node->input("SampleX")); connect(out_dy, node->input("SampleY")); - /* add generated nodes */ + /* Add generated nodes. */ foreach (NodePair &pair, nodes_dx) add(pair.second); foreach (NodePair &pair, nodes_dy) diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp index 03c152928d5..3b58556f601 100644 --- a/intern/cycles/scene/shader_nodes.cpp +++ b/intern/cycles/scene/shader_nodes.cpp @@ -5882,7 +5882,7 @@ void BlackbodyNode::constant_fold(const ConstantFolder &folder) if (folder.all_inputs_constant()) { const float3 rgb_rec709 = svm_math_blackbody_color_rec709(temperature); const float3 rgb = folder.scene->shader_manager->rec709_to_scene_linear(rgb_rec709); - folder.make_constant(rgb); + folder.make_constant(max(rgb, zero_float3())); } } diff --git a/intern/cycles/session/denoising.cpp b/intern/cycles/session/denoising.cpp index ff9ca1b4345..a9377d412e8 100644 --- a/intern/cycles/session/denoising.cpp +++ b/intern/cycles/session/denoising.cpp @@ -90,7 +90,7 @@ static vector<ChannelMapping> output_channels() return map; } -/* Renderlayer Handling */ +/* Render-layer Handling. */ bool DenoiseImageLayer::detect_denoising_channels() { diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp index 143628f1e30..dac36ab0135 100644 --- a/intern/cycles/test/render_graph_finalize_test.cpp +++ b/intern/cycles/test/render_graph_finalize_test.cpp @@ -946,7 +946,7 @@ TEST_F(RenderGraph, constant_fold_bright_contrast) TEST_F(RenderGraph, constant_fold_blackbody) { EXPECT_ANY_MESSAGE(log); - CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.94163, 0.226523, 0)."); + CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.96553, 0.227897, 0)."); builder .add_node(ShaderNodeBuilder<BlackbodyNode>(graph, "Blackbody").set("Temperature", 1200.0f)) diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index af33264e044..c668638eb02 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -74,9 +74,9 @@ struct half4 { ccl_device_inline half float_to_half_image(float f) { #if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) - return half(f); + return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) - return __float2half(f); + return __float2half(min(f, 65504.0f)); #else const uint u = __float_as_uint(f); /* Sign bit, shifted to its position. */ @@ -139,9 +139,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h) ccl_device_inline half float_to_half_display(const float f) { #if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) - return half(f); + return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) - return __float2half(f); + return __float2half(min(f, 65504.0f)); #else const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f); const int absolute = x & 0x7FFFFFFF; diff --git a/intern/cycles/util/image.h b/intern/cycles/util/image.h index 9348125d072..17446a83e59 100644 --- a/intern/cycles/util/image.h +++ b/intern/cycles/util/image.h @@ -78,6 +78,26 @@ template<> inline half util_image_cast_from_float(float value) return float_to_half_image(value); } +/* Multiply image pixels in native data format. */ +template<typename T> inline T util_image_multiply_native(T a, T b); + +template<> inline float util_image_multiply_native(float a, float b) +{ + return a * b; +} +template<> inline uchar util_image_multiply_native(uchar a, uchar b) +{ + return ((uint32_t)a * (uint32_t)b) / 255; +} +template<> inline uint16_t util_image_multiply_native(uint16_t a, uint16_t b) +{ + return ((uint32_t)a * (uint32_t)b) / 65535; +} +template<> inline half util_image_multiply_native(half a, half b) +{ + return float_to_half_image(half_to_float_image(a) * half_to_float_image(b)); +} + CCL_NAMESPACE_END #endif /* __UTIL_IMAGE_H__ */ diff --git a/intern/cycles/util/texture.h b/intern/cycles/util/texture.h index e8bb058a3c9..90e842933c2 100644 --- a/intern/cycles/util/texture.h +++ b/intern/cycles/util/texture.h @@ -37,6 +37,8 @@ typedef enum ImageDataType { IMAGE_DATA_TYPE_USHORT = 7, IMAGE_DATA_TYPE_NANOVDB_FLOAT = 8, IMAGE_DATA_TYPE_NANOVDB_FLOAT3 = 9, + IMAGE_DATA_TYPE_NANOVDB_FPN = 10, + IMAGE_DATA_TYPE_NANOVDB_FP16 = 11, IMAGE_DATA_NUM_TYPES } ImageDataType; |