diff options
Diffstat (limited to 'intern/cycles')
88 files changed, 2036 insertions, 1360 deletions
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 4919b99cfe0..63d89221d20 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -128,10 +128,6 @@ if(WITH_OPENIMAGEDENOISE) ) endif() -if(WITH_EXPERIMENTAL_FEATURES) - add_definitions(-DWITH_NEW_CURVES_TYPE) -endif() - blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}") add_dependencies(bf_intern_cycles bf_rna) diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index cc6d574da99..e1f08c07eaf 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -84,10 +84,36 @@ class AddPresetViewportSampling(AddPresetBase, Operator): preset_subdir = "cycles/viewport_sampling" +class AddPresetPerformance(AddPresetBase, Operator): + '''Add a Performance Preset''' + bl_idname = "render.cycles_performance_preset_add" + bl_label = "Add Performance Preset" + preset_menu = "CYCLES_PT_performance_presets" + + preset_defines = [ + "render = bpy.context.scene.render", + "cycles = bpy.context.scene.cycles" + ] + + preset_values = [ + "render.threads_mode", + "render.use_persistent_data", + "cycles.debug_use_spatial_splits", + "cycles.debug_use_compact_bvh", + "cycles.debug_use_hair_bvh", + "cycles.debug_bvh_time_steps", + "cycles.use_auto_tile", + "cycles.tile_size", + ] + + preset_subdir = "cycles/performance" + + classes = ( AddPresetIntegrator, AddPresetSampling, AddPresetViewportSampling, + AddPresetPerformance, ) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index e88b65b5119..2c926893f9d 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -693,7 +693,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): debug_use_compact_bvh: BoolProperty( name="Use Compact BVH", description="Use compact BVH structure (uses less ram but renders 
slower)", - default=True, + default=False, ) debug_bvh_time_steps: IntProperty( name="BVH Time Steps", @@ -1560,7 +1560,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): if sys.platform.startswith("win"): col.label(text="and Windows driver version 101.1660 or newer", icon='BLANK1') elif sys.platform.startswith("linux"): - col.label(text="and Linux driver version xx.xx.28000 or newer", icon='BLANK1') + col.label(text="and Linux driver version xx.xx.23570 or newer", icon='BLANK1') elif device_type == 'METAL': col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1') col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1') diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 77da3f36685..0fead409866 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -43,6 +43,12 @@ class CYCLES_PT_integrator_presets(CyclesPresetPanel): preset_add_operator = "render.cycles_integrator_preset_add" +class CYCLES_PT_performance_presets(CyclesPresetPanel): + bl_label = "Performance Presets" + preset_subdir = "cycles/performance" + preset_add_operator = "render.cycles_performance_preset_add" + + class CyclesButtonsPanel: bl_space_type = "PROPERTIES" bl_region_type = "WINDOW" @@ -624,6 +630,9 @@ class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel): bl_label = "Performance" bl_options = {'DEFAULT_CLOSED'} + def draw_header_preset(self, context): + CYCLES_PT_performance_presets.draw_panel_header(self.layout) + def draw(self, context): pass @@ -943,6 +952,8 @@ class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel): col = split.column() col.prop(dof, "focus_object", text="Focus Object") + if dof.focus_object and dof.focus_object.type == 'ARMATURE': + col.prop_search(dof, "focus_subtarget", dof.focus_object.data, "bones", text="Focus Bone") sub = col.row() sub.active = dof.focus_object is None @@ -1202,7 +1213,7 @@ class CYCLES_OBJECT_PT_lightgroup(CyclesButtonsPanel, Panel): 
sub.prop_search(ob, "lightgroup", view_layer, "lightgroups", text="Light Group", results_are_suggestions=True) sub = row.column(align=True) - sub.active = bool(ob.lightgroup) and not any(lg.name == ob.lightgroup for lg in view_layer.lightgroups) + sub.enabled = bool(ob.lightgroup) and not any(lg.name == ob.lightgroup for lg in view_layer.lightgroups) sub.operator("scene.view_layer_add_lightgroup", icon='ADD', text="").name = ob.lightgroup @@ -1640,7 +1651,7 @@ class CYCLES_WORLD_PT_settings_light_group(CyclesButtonsPanel, Panel): ) sub = row.column(align=True) - sub.active = bool(world.lightgroup) and not any(lg.name == world.lightgroup for lg in view_layer.lightgroups) + sub.enabled = bool(world.lightgroup) and not any(lg.name == world.lightgroup for lg in view_layer.lightgroups) sub.operator("scene.view_layer_add_lightgroup", icon='ADD', text="").name = world.lightgroup @@ -2269,6 +2280,7 @@ classes = ( CYCLES_PT_sampling_presets, CYCLES_PT_viewport_sampling_presets, CYCLES_PT_integrator_presets, + CYCLES_PT_performance_presets, CYCLES_RENDER_PT_sampling, CYCLES_RENDER_PT_sampling_viewport, CYCLES_RENDER_PT_sampling_viewport_denoise, diff --git a/intern/cycles/blender/camera.cpp b/intern/cycles/blender/camera.cpp index 2ab5f02a337..6926c833096 100644 --- a/intern/cycles/blender/camera.cpp +++ b/intern/cycles/blender/camera.cpp @@ -143,11 +143,20 @@ static float blender_camera_focal_distance(BL::RenderEngine &b_engine, if (!b_dof_object) return b_camera.dof().focus_distance(); + Transform dofmat = get_transform(b_dof_object.matrix_world()); + + string focus_subtarget = b_camera.dof().focus_subtarget(); + if (b_dof_object.pose() && !focus_subtarget.empty()) { + BL::PoseBone b_bone = b_dof_object.pose().bones[focus_subtarget]; + if (b_bone) { + dofmat = dofmat * get_transform(b_bone.matrix()); + } + } + /* for dof object, return distance along camera Z direction */ BL::Array<float, 16> b_ob_matrix; b_engine.camera_model_matrix(b_ob, bcam->use_spherical_stereo, 
b_ob_matrix); Transform obmat = transform_clear_scale(get_transform(b_ob_matrix)); - Transform dofmat = get_transform(b_dof_object.matrix_world()); float3 view_dir = normalize(transform_get_column(&obmat, 2)); float3 dof_dir = transform_get_column(&obmat, 3) - transform_get_column(&dofmat, 3); return fabsf(dot(view_dir, dof_dir)); diff --git a/intern/cycles/blender/curves.cpp b/intern/cycles/blender/curves.cpp index 10012720bd8..c4154bce022 100644 --- a/intern/cycles/blender/curves.cpp +++ b/intern/cycles/blender/curves.cpp @@ -613,8 +613,6 @@ void BlenderSync::sync_particle_hair( } } -#ifdef WITH_NEW_CURVES_TYPE - static std::optional<BL::FloatAttribute> find_curves_radius_attribute(BL::Curves b_curves) { for (BL::Attribute &b_attribute : b_curves.attributes) { @@ -632,6 +630,25 @@ static std::optional<BL::FloatAttribute> find_curves_radius_attribute(BL::Curves return std::nullopt; } +static BL::FloatVectorAttribute find_curves_position_attribute(BL::Curves b_curves) +{ + for (BL::Attribute &b_attribute : b_curves.attributes) { + if (b_attribute.name() != "position") { + continue; + } + if (b_attribute.domain() != BL::Attribute::domain_POINT) { + continue; + } + if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT_VECTOR) { + continue; + } + return BL::FloatVectorAttribute{b_attribute}; + } + /* The position attribute must exist. 
*/ + assert(false); + return BL::FloatVectorAttribute{b_curves.attributes[0]}; +} + template<typename TypeInCycles, typename GetValueAtIndex> static void fill_generic_attribute(BL::Curves &b_curves, TypeInCycles *data, @@ -795,16 +812,16 @@ static void attr_create_generic(Scene *scene, } } -static float4 hair_point_as_float4(BL::Curves b_curves, +static float4 hair_point_as_float4(BL::FloatVectorAttribute b_attr_position, std::optional<BL::FloatAttribute> b_attr_radius, const int index) { - float4 mP = float3_to_float4(get_float3(b_curves.position_data[index].vector())); + float4 mP = float3_to_float4(get_float3(b_attr_position.data[index].vector())); mP.w = b_attr_radius ? b_attr_radius->data[index].value() : 0.0f; return mP; } -static float4 interpolate_hair_points(BL::Curves b_curves, +static float4 interpolate_hair_points(BL::FloatVectorAttribute b_attr_position, std::optional<BL::FloatAttribute> b_attr_radius, const int first_point_index, const int num_points, @@ -814,8 +831,8 @@ static float4 interpolate_hair_points(BL::Curves b_curves, const int point_a = clamp((int)curve_t, 0, num_points - 1); const int point_b = min(point_a + 1, num_points - 1); const float t = curve_t - (float)point_a; - return lerp(hair_point_as_float4(b_curves, b_attr_radius, first_point_index + point_a), - hair_point_as_float4(b_curves, b_attr_radius, first_point_index + point_b), + return lerp(hair_point_as_float4(b_attr_position, b_attr_radius, first_point_index + point_a), + hair_point_as_float4(b_attr_position, b_attr_radius, first_point_index + point_b), t); } @@ -848,6 +865,7 @@ static void export_hair_curves(Scene *scene, hair->reserve_curves(num_curves, num_keys); + BL::FloatVectorAttribute b_attr_position = find_curves_position_attribute(b_curves); std::optional<BL::FloatAttribute> b_attr_radius = find_curves_radius_attribute(b_curves); /* Export curves and points. */ @@ -866,7 +884,7 @@ static void export_hair_curves(Scene *scene, /* Position and radius. 
*/ for (int i = 0; i < num_points; i++) { - const float3 co = get_float3(b_curves.position_data[first_point_index + i].vector()); + const float3 co = get_float3(b_attr_position.data[first_point_index + i].vector()); const float radius = b_attr_radius ? b_attr_radius->data[first_point_index + i].value() : 0.005f; hair->add_curve_key(co, radius); @@ -923,6 +941,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motio int num_motion_keys = 0; int curve_index = 0; + BL::FloatVectorAttribute b_attr_position = find_curves_position_attribute(b_curves); std::optional<BL::FloatAttribute> b_attr_radius = find_curves_radius_attribute(b_curves); for (int i = 0; i < num_curves; i++) { @@ -938,7 +957,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motio int point_index = first_point_index + i; if (point_index < num_keys) { - mP[num_motion_keys] = hair_point_as_float4(b_curves, b_attr_radius, point_index); + mP[num_motion_keys] = hair_point_as_float4(b_attr_position, b_attr_radius, point_index); num_motion_keys++; if (!have_motion) { @@ -958,7 +977,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motio for (int i = 0; i < curve.num_keys; i++) { const float step = i * step_size; mP[num_motion_keys] = interpolate_hair_points( - b_curves, b_attr_radius, first_point_index, num_points, step); + b_attr_position, b_attr_radius, first_point_index, num_points, step); num_motion_keys++; } have_motion = true; @@ -990,15 +1009,6 @@ void BlenderSync::sync_hair(Hair *hair, BObjectInfo &b_ob_info, bool motion, int export_hair_curves(scene, hair, b_curves, need_motion, motion_scale); } } -#else -void BlenderSync::sync_hair(Hair *hair, BObjectInfo &b_ob_info, bool motion, int motion_step) -{ - (void)hair; - (void)b_ob_info; - (void)motion; - (void)motion_step; -} -#endif void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, Hair *hair) { @@ -1010,14 +1020,11 @@ void 
BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, H new_hair.set_used_shaders(used_shaders); if (view_layer.use_hair) { -#ifdef WITH_NEW_CURVES_TYPE if (b_ob_info.object_data.is_a(&RNA_Curves)) { /* Hair object. */ sync_hair(&new_hair, b_ob_info, false); } - else -#endif - { + else { /* Particle hair. */ bool need_undeformed = new_hair.need_attribute(scene, ATTR_STD_GENERATED); BL::Mesh b_mesh = object_to_mesh( @@ -1064,15 +1071,12 @@ void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph, /* Export deformed coordinates. */ if (ccl::BKE_object_is_deform_modified(b_ob_info, b_scene, preview)) { -#ifdef WITH_NEW_CURVES_TYPE if (b_ob_info.object_data.is_a(&RNA_Curves)) { /* Hair object. */ sync_hair(hair, b_ob_info, true, motion_step); return; } - else -#endif - { + else { /* Particle hair. */ BL::Mesh b_mesh = object_to_mesh( b_data, b_ob_info, b_depsgraph, false, Mesh::SUBDIVISION_NONE); diff --git a/intern/cycles/blender/geometry.cpp b/intern/cycles/blender/geometry.cpp index 215860f59e6..fc03ca6e489 100644 --- a/intern/cycles/blender/geometry.cpp +++ b/intern/cycles/blender/geometry.cpp @@ -18,11 +18,7 @@ CCL_NAMESPACE_BEGIN static Geometry::Type determine_geom_type(BObjectInfo &b_ob_info, bool use_particle_hair) { -#ifdef WITH_NEW_CURVES_TYPE if (b_ob_info.object_data.is_a(&RNA_Curves) || use_particle_hair) { -#else - if (use_particle_hair) { -#endif return Geometry::HAIR; } @@ -217,11 +213,7 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph, if (progress.get_cancel()) return; -#ifdef WITH_NEW_CURVES_TYPE if (b_ob_info.object_data.is_a(&RNA_Curves) || use_particle_hair) { -#else - if (use_particle_hair) { -#endif Hair *hair = static_cast<Hair *>(geom); sync_hair_motion(b_depsgraph, b_ob_info, hair, motion_step); } diff --git a/intern/cycles/blender/pointcloud.cpp b/intern/cycles/blender/pointcloud.cpp index 0312ad87a70..b4e90859877 100644 --- a/intern/cycles/blender/pointcloud.cpp +++ 
b/intern/cycles/blender/pointcloud.cpp @@ -1,8 +1,10 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2011-2022 Blender Foundation */ -#include "scene/pointcloud.h" +#include <optional> + #include "scene/attribute.h" +#include "scene/pointcloud.h" #include "scene/scene.h" #include "blender/sync.h" @@ -138,6 +140,36 @@ static void copy_attributes(PointCloud *pointcloud, } } +static std::optional<BL::FloatAttribute> find_radius_attribute(BL::PointCloud b_pointcloud) +{ + for (BL::Attribute &b_attribute : b_pointcloud.attributes) { + if (b_attribute.name() != "radius") { + continue; + } + if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT) { + continue; + } + return BL::FloatAttribute{b_attribute}; + } + return std::nullopt; +} + +static BL::FloatVectorAttribute find_position_attribute(BL::PointCloud b_pointcloud) +{ + for (BL::Attribute &b_attribute : b_pointcloud.attributes) { + if (b_attribute.name() != "position") { + continue; + } + if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT_VECTOR) { + continue; + } + return BL::FloatVectorAttribute{b_attribute}; + } + /* The position attribute must exist. */ + assert(false); + return BL::FloatVectorAttribute{b_pointcloud.attributes[0]}; +} + static void export_pointcloud(Scene *scene, PointCloud *pointcloud, BL::PointCloud b_pointcloud, @@ -156,18 +188,18 @@ static void export_pointcloud(Scene *scene, const int num_points = b_pointcloud.points.length(); pointcloud->reserve(num_points); + BL::FloatVectorAttribute b_attr_position = find_position_attribute(b_pointcloud); + std::optional<BL::FloatAttribute> b_attr_radius = find_radius_attribute(b_pointcloud); + /* Export points. 
*/ - BL::PointCloud::points_iterator b_point_iter; - for (b_pointcloud.points.begin(b_point_iter); b_point_iter != b_pointcloud.points.end(); - ++b_point_iter) { - BL::Point b_point = *b_point_iter; - const float3 co = get_float3(b_point.co()); - const float radius = b_point.radius(); + for (int i = 0; i < num_points; i++) { + const float3 co = get_float3(b_attr_position.data[i].vector()); + const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f; pointcloud->add_point(co, radius); /* Random number per point. */ if (attr_random != NULL) { - attr_random->add(hash_uint2_to_float(b_point.index(), 0)); + attr_random->add(hash_uint2_to_float(i, 0)); } } @@ -195,14 +227,15 @@ static void export_pointcloud_motion(PointCloud *pointcloud, int num_motion_points = 0; const array<float3> &pointcloud_points = pointcloud->get_points(); - BL::PointCloud::points_iterator b_point_iter; - for (b_pointcloud.points.begin(b_point_iter); b_point_iter != b_pointcloud.points.end(); - ++b_point_iter) { - BL::Point b_point = *b_point_iter; + BL::FloatVectorAttribute b_attr_position = find_position_attribute(b_pointcloud); + std::optional<BL::FloatAttribute> b_attr_radius = find_radius_attribute(b_pointcloud); + for (int i = 0; i < num_points; i++) { if (num_motion_points < num_points) { - float3 P = get_float3(b_point.co()); - P.w = b_point.radius(); + const float3 co = get_float3(b_attr_position.data[i].vector()); + const float radius = b_attr_radius ? 
b_attr_radius->data[i].value() : 0.0f; + float3 P = co; + P.w = radius; mP[num_motion_points] = P; have_motion = have_motion || (P != pointcloud_points[num_motion_points]); num_motion_points++; diff --git a/intern/cycles/bvh/params.h b/intern/cycles/bvh/params.h index 41d851ee687..648350d03b0 100644 --- a/intern/cycles/bvh/params.h +++ b/intern/cycles/bvh/params.h @@ -129,7 +129,7 @@ class BVHParams { top_level = false; bvh_layout = BVH_LAYOUT_BVH2; - use_compact_structure = true; + use_compact_structure = false; use_unaligned_nodes = false; num_motion_curve_steps = 0; diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp index d4f0532aa5e..1e4b9baa0c0 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -197,7 +197,7 @@ void CPUDevice::const_copy_to(const char *name, void *host, size_t size) // Update scene handle (since it is different for each device on multi devices) KernelData *const data = (KernelData *)host; - data->bvh.scene = embree_scene; + data->device_bvh = embree_scene; } #endif kernel_const_copy(&kernel_globals, name, host, size); diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 340be85e853..cdb13ca0a97 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -29,6 +29,7 @@ class DeviceQueue; class Progress; class CPUKernels; class CPUKernelThreadGlobals; +class Scene; /* Device Types */ @@ -186,6 +187,11 @@ class Device { return 0; } + /* Called after kernel texture setup, and prior to integrator state setup. 
*/ + virtual void optimize_for_scene(Scene * /*scene*/) + { + } + virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { /* Memory is always resident if this is not a multi device, regardless of whether the pointer diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 4aea8d697a5..99e60d3a788 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -75,7 +75,8 @@ class MetalDevice : public Device { std::vector<id<MTLTexture>> texture_slot_map; bool use_metalrt = false; - bool use_function_specialisation = false; + MetalPipelineType kernel_specialization_level = PSO_GENERIC; + std::atomic_bool async_compile_and_load = false; virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -91,9 +92,7 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); - string get_source(const uint kernel_features); - - string compile_kernel(const uint kernel_features, const char *name); + void make_source(MetalPipelineType pso_type, const uint kernel_features); virtual bool load_kernels(const uint kernel_features) override; @@ -111,7 +110,9 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - id<MTLLibrary> compile(string const &source); + virtual void optimize_for_scene(Scene *scene) override; + + bool compile_and_load(MetalPipelineType pso_type); /* ------------------------------------------------------------------ */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 87c83242240..d1250b83d22 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -6,6 +6,8 @@ # include "device/metal/device_impl.h" # include "device/metal/device.h" +# include "scene/scene.h" + # include "util/debug.h" # include "util/md5.h" # include "util/path.h" @@ -78,6 +80,10 @@ 
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; use_metalrt = info.use_metalrt; + + /* Specialize the intersection kernels on Apple GPUs by default as these can be built very + * quickly. */ + kernel_specialization_level = PSO_SPECIALIZED_INTERSECT; break; } } @@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile capture_enabled = true; } + if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) { + kernel_specialization_level = (MetalPipelineType)atoi(envstr); + } + metal_printf("kernel_specialization_level = %s\n", + kernel_type_as_string( + (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1))); + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation() return DebugFlags().metal.adaptive_compile; } -string MetalDevice::get_source(const uint kernel_features) +void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features) { - string build_options; - + string global_defines; if (use_adaptive_compilation()) { - build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } if (use_metalrt) { - build_options += "-D__METALRT__ "; + global_defines += "#define __METALRT__\n"; if (motion_blur) { - build_options += "-D__METALRT_MOTION__ "; + global_defines += "#define __METALRT_MOTION__\n"; } } # ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; + global_defines += "#define __KERNEL_DEBUG__\n"; # endif switch (device_vendor) { default: break; case METAL_GPU_INTEL: - build_options += "-D__KERNEL_METAL_INTEL__ "; + global_defines += "#define __KERNEL_METAL_INTEL__\n"; break; case 
METAL_GPU_AMD: - build_options += "-D__KERNEL_METAL_AMD__ "; + global_defines += "#define __KERNEL_METAL_AMD__\n"; break; case METAL_GPU_APPLE: - build_options += "-D__KERNEL_METAL_APPLE__ "; + global_defines += "#define __KERNEL_METAL_APPLE__\n"; break; } - /* reformat -D defines list into compilable form */ - vector<string> components; - string_replace(build_options, "-D", ""); - string_split(components, build_options, " "); + string &source = this->source[pso_type]; + source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; + source = path_source_replace_includes(source, path_get("source")); - string globalDefines; - for (const string &component : components) { - vector<string> assignments; - string_split(assignments, component, "="); - if (assignments.size() == 2) - globalDefines += string_printf( - "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str()); - else - globalDefines += string_printf("#define %s\n", assignments[0].c_str()); + /* Perform any required specialization on the source. + * With Metal function constants we can generate a single variant of the kernel source which can + * be repeatedly respecialized. + */ + string baked_constants; + + /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of + * the same character length. Build a string of all active constant values which is then hashed + * in order to identify the PSO. + */ + if (pso_type != PSO_GENERIC) { + const double starttime = time_dt(); + +# define KERNEL_STRUCT_BEGIN(name, parent) \ + string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_"); + + /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */ +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + baked_constants += string(#parent "." #name "=") + \ + to_string(_type(launch_params.data.parent.name)) + "\n"; + +# include "kernel/data_template.h" + + /* Opt in to all of available specializations. 
This can be made more granular for the + * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests, + * but the overhead should be negligible as these are very quick to (re)build and aren't + * serialized to disk via MTLBinaryArchives. + */ + global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n"; + + metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0); } - string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n"; - source = path_source_replace_includes(source, path_get("source")); - - metal_printf("Global defines:\n%s\n", globalDefines.c_str()); + source = global_defines + source; + metal_printf("================\n%s================\n\%s================\n", + global_defines.c_str(), + baked_constants.c_str()); - return source; + /* Generate an MD5 from the source and include any baked constants. This is used when caching + * PSOs. */ + MD5Hash md5; + md5.append(baked_constants); + md5.append(source); + source_md5[pso_type] = md5.get_hex(); } bool MetalDevice::load_kernels(const uint _kernel_features) @@ -279,28 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. 
*/ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - source[PSO_GENERIC] = get_source(kernel_features); - - const double starttime = time_dt(); - - mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - - metal_printf("Front-end compilation finished in %.1f seconds (generic)\n", - time_dt() - starttime); - - MD5Hash md5; - md5.append(source[PSO_GENERIC]); - source_md5[PSO_GENERIC] = md5.get_hex(); - - bool result = MetalDeviceKernels::load(this, false); + bool result = compile_and_load(PSO_GENERIC); reserve_local_memory(kernel_features); - return result; } -id<MTLLibrary> MetalDevice::compile(string const &source) +bool MetalDevice::compile_and_load(MetalPipelineType pso_type) { + make_source(pso_type, kernel_features); + + if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) { + /* We already have a full set of matching pipelines which are cached or queued. */ + metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type)); + return true; + } + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; @@ -308,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source) options.languageVersion = MTLLanguageVersion2_4; } + if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) { + path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))), + source[pso_type]); + } + + const double starttime = time_dt(); + NSError *error = NULL; - id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) - options:options - error:&error]; + mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str()) + options:options + error:&error]; - if (!mtlLibrary) { + if (!mtlLibrary[pso_type]) { NSString *err = [error localizedDescription]; set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); } + metal_printf("Front-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + 
kernel_type_as_string(pso_type)); + [options release]; - return mtlLibrary; + return MetalDeviceKernels::load(this, pso_type); } void MetalDevice::reserve_local_memory(const uint kernel_features) @@ -627,6 +670,58 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz return 0; } +void MetalDevice::optimize_for_scene(Scene *scene) +{ + MetalPipelineType specialization_level = kernel_specialization_level; + + if (specialization_level < PSO_SPECIALIZED_INTERSECT) { + return; + } + + /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them + * synchronously. */ + compile_and_load(PSO_SPECIALIZED_INTERSECT); + + if (specialization_level < PSO_SPECIALIZED_SHADE) { + return; + } + if (!scene->params.background) { + /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to + * build. */ + return; + } + + /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and + * only if there isn't an existing load in flight. + */ + auto specialize_shade_fn = ^() { + compile_and_load(PSO_SPECIALIZED_SHADE); + async_compile_and_load = false; + }; + + bool async_specialize_shade = true; + + /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). 
*/ + if (getenv("CYCLES_METAL_PROFILING") != nullptr) { + async_specialize_shade = false; + } + + if (async_specialize_shade) { + if (!async_compile_and_load) { + async_compile_and_load = true; + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), + specialize_shade_fn); + } + else { + metal_printf( + "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n"); + } + } + else { + specialize_shade_fn(); + } +} + void MetalDevice::const_copy_to(const char *name, void *host, size_t size) { if (strcmp(name, "data") == 0) { @@ -652,7 +747,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size) /* Update data storage pointers in launch parameters. */ if (strcmp(name, "integrator_state") == 0) { /* IntegratorStateGPU is contiguous pointers */ - const size_t pointer_block_size = sizeof(IntegratorStateGPU); + const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor); update_launch_pointers( offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size); } diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 69b2a686ecc..11393f8b7e1 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -31,7 +31,7 @@ enum { enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM }; /* Pipeline State Object types */ -enum { +enum MetalPipelineType { /* A kernel that can be used with all scenes, supporting all features. * It is slow to compile, but only needs to be compiled once and is then * cached for future render sessions. This allows a render to get underway @@ -39,28 +39,33 @@ enum { */ PSO_GENERIC, - /* A kernel that is relatively quick to compile, but is specialized for the - * scene being rendered. 
 It only contains the functionality and even baked in - constants for values that means it needs to be recompiled whenever a - dependent setting is changed. The render performance of this kernel is - significantly faster though, and justifies the extra compile time. + /* An intersection kernel that is very quick to specialize and results in faster intersection + * kernel performance. It uses Metal function constants to replace several KernelData variables + * with fixed constants. + */ + PSO_SPECIALIZED_INTERSECT, + + /* A shading kernel that is slow to specialize, but results in faster shading kernel performance. + * It uses Metal function constants to replace several KernelData variables with fixed + * constants and short-circuit all unused SVM node case handlers. */ - /* METAL_WIP: This isn't used and will require more changes to enable. */ - PSO_SPECIALISED, + PSO_SPECIALIZED_SHADE, PSO_NUM }; -const char *kernel_type_as_string(int kernel_type); +const char *kernel_type_as_string(MetalPipelineType pso_type); struct MetalKernelPipeline { void compile(); id<MTLLibrary> mtlLibrary = nil; - bool scene_specialized; + MetalPipelineType pso_type; string source_md5; + size_t usage_count = 0; + KernelData kernel_data_; bool use_metalrt; bool metalrt_hair; bool metalrt_hair_thick; @@ -75,6 +80,8 @@ struct MetalKernelPipeline { id<MTLComputePipelineState> pipeline = nil; int num_threads_per_block = 0; + bool should_use_binary_archive() const; + string error_str; API_AVAILABLE(macos(11.0))
*/ namespace MetalDeviceKernels { -bool load(MetalDevice *device, bool scene_specialized); +bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type); +bool load(MetalDevice *device, MetalPipelineType pso_type); const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel); } /* namespace MetalDeviceKernels */ diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index fec4cd80466..385cb412b06 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -5,6 +5,7 @@ # include "device/metal/kernel.h" # include "device/metal/device_impl.h" +# include "kernel/device/metal/function_constants.h" # include "util/md5.h" # include "util/path.h" # include "util/tbb.h" @@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN /* limit to 2 MTLCompiler instances */ int max_mtlcompiler_threads = 2; -const char *kernel_type_as_string(int kernel_type) +const char *kernel_type_as_string(MetalPipelineType pso_type) { - switch (kernel_type) { + switch (pso_type) { case PSO_GENERIC: return "PSO_GENERIC"; - case PSO_SPECIALISED: - return "PSO_SPECIALISED"; + case PSO_SPECIALIZED_INTERSECT: + return "PSO_SPECIALIZED_INTERSECT"; + case PSO_SPECIALIZED_SHADE: + return "PSO_SPECIALIZED_SHADE"; default: assert(0); } @@ -50,7 +53,11 @@ struct ShaderCache { /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by * device. 
*/ - void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized); + void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type); + + bool should_load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type); void wait_for_all(); @@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index) } } -void ShaderCache::load_kernel(DeviceKernel device_kernel, - MetalDevice *device, - bool scene_specialized) +bool ShaderCache::should_load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type) { - { - /* create compiler threads on first run */ - thread_scoped_lock lock(cache_mutex); - if (compile_threads.empty()) { - running = true; - for (int i = 0; i < max_mtlcompiler_threads; i++) { - compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); - } - } + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + /* Skip megakernel. */ + return false; } - if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - /* skip megakernel */ - return; + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) { + /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */ + return false; + } } - if (scene_specialized) { + if (pso_type != PSO_GENERIC) { /* Only specialize kernels where it can make an impact. */ if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - return; + return false; + } + + /* Only specialize shading / intersection kernels as requested. 
*/ + bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE); + if (is_shade_pso != is_shade_kernel) { + return false; } } @@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, /* check whether the kernel has already been requested / cached */ thread_scoped_lock lock(cache_mutex); for (auto &pipeline : pipelines[device_kernel]) { - if (scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { - /* we already requested a pipeline that is specialized for this kernel data */ - metal_printf("Specialized kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } + if (pipeline->source_md5 == device->source_md5[pso_type]) { + return false; } - else { - if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) { - /* we already requested a generic pipeline for this kernel */ - metal_printf("Generic kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } + } + } + + return true; +} + +void ShaderCache::load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type) +{ + { + /* create compiler threads on first run */ + thread_scoped_lock lock(cache_mutex); + if (compile_threads.empty()) { + running = true; + for (int i = 0; i < max_mtlcompiler_threads; i++) { + compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); } } } + if (!should_load_kernel(device_kernel, device, pso_type)) { + return; + } + incomplete_requests++; PipelineRequest request; request.pipeline = new MetalKernelPipeline; - request.pipeline->scene_specialized = scene_specialized; + memcpy(&request.pipeline->kernel_data_, + &device->launch_params.data, + sizeof(request.pipeline->kernel_data_)); + request.pipeline->pso_type = pso_type; request.pipeline->mtlDevice = mtlDevice; - request.pipeline->source_md5 = - device->source_md5[scene_specialized ? 
PSO_SPECIALISED : PSO_GENERIC]; - request.pipeline->mtlLibrary = - device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; + request.pipeline->source_md5 = device->source_md5[pso_type]; + request.pipeline->mtlLibrary = device->mtlLibrary[pso_type]; request.pipeline->device_kernel = device_kernel; request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; @@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, { thread_scoped_lock lock(cache_mutex); - pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); + auto &collection = pipelines[device_kernel]; + + /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */ + int max_entries_of_same_pso_type = 3; + for (int i = (int)collection.size() - 1; i >= 0; i--) { + if (collection[i]->pso_type == pso_type) { + max_entries_of_same_pso_type -= 1; + if (max_entries_of_same_pso_type == 0) { + metal_printf("Purging oldest %s:%s kernel from ShaderCache\n", + kernel_type_as_string(pso_type), + device_kernel_as_string(device_kernel)); + collection.erase(collection.begin() + i); + break; + } + } + } + + collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); request_queue.push_back(request); } cond_var.notify_one(); @@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M continue; } - if (pipeline->scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { + if (pipeline->pso_type != PSO_GENERIC) { + if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] || + pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) { best_pipeline = pipeline.get(); } } @@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M } } + if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) { + metal_printf("Swapping in %s version of %s\n", + 
kernel_type_as_string(best_pipeline->pso_type), + device_kernel_as_string(kernel)); + } + best_pipeline->usage_count += 1; + return best_pipeline; } -void MetalKernelPipeline::compile() +bool MetalKernelPipeline::should_use_binary_archive() const { - int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC; + if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + if (atoi(str) != 0) { + /* Don't archive if we have opted out by env var. */ + return false; + } + } + + if (pso_type == PSO_GENERIC) { + /* Archive the generic kernels. */ + return true; + } + + if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && + device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Archive all shade kernels - they take a long time to compile. */ + return true; + } + + /* The remaining kernels are all fast to compile. They may get cached by the system shader cache, + * but will be quick to regenerate if not. */ + return false; +} + +static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr) +{ + MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new]; + + MTLDataType MTLDataType_int = MTLDataTypeInt; + MTLDataType MTLDataType_float = MTLDataTypeFloat; + MTLDataType MTLDataType_float4 = MTLDataTypeFloat4; + KernelData zero_data = {0}; + if (!data) { + data = &zero_data; + } +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + [constant_values setConstantValue:&data->parent.name \ + type:MTLDataType_##_type \ + atIndex:KernelData_##parent##_##name]; + +# include "kernel/data_template.h" + + return constant_values; +} + +void MetalKernelPipeline::compile() +{ const std::string function_name = std::string("cycles_metal_") + device_kernel_as_string(device_kernel); @@ -281,6 +371,17 @@ void MetalKernelPipeline::compile() if (@available(macOS 11.0, *)) { MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; func_desc.name = entryPoint; + + if (pso_type == 
PSO_SPECIALIZED_SHADE) { + func_desc.constantValues = GetConstantValues(&kernel_data_); + } + else if (pso_type == PSO_SPECIALIZED_INTERSECT) { + func_desc.constantValues = GetConstantValues(&kernel_data_); + } + else { + func_desc.constantValues = GetConstantValues(); + } + function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error]; } @@ -427,10 +528,7 @@ void MetalKernelPipeline::compile() MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; - bool use_binary_archive = true; - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { - use_binary_archive = (atoi(str) == 0); - } + bool use_binary_archive = should_use_binary_archive(); id<MTLBinaryArchive> archive = nil; string metalbin_path; @@ -608,19 +706,32 @@ void MetalKernelPipeline::compile() } } -bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized) +bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type) { + const double starttime = time_dt(); auto shader_cache = get_shader_cache(device->mtlDevice); for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { - shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized); + shader_cache->load_kernel((DeviceKernel)i, device, pso_type); } - if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) { - shader_cache->wait_for_all(); - } + shader_cache->wait_for_all(); + metal_printf("Back-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + kernel_type_as_string(pso_type)); return true; } +bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type) +{ + auto shader_cache = get_shader_cache(device->mtlDevice); + for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { + if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) { + return true; + } + } + return false; +} + const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device, DeviceKernel kernel) { diff --git a/intern/cycles/device/metal/queue.h 
b/intern/cycles/device/metal/queue.h index b0bd487c86d..fc32740f3e1 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; + virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 03e60b6bb6e..5ac63a16c61 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const return result; } +int MetalDeviceQueue::num_sort_partition_elements() const +{ + return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice); +} + void MetalDeviceQueue::init_execution() { /* Synchronize all textures and memory copies before executing task. */ @@ -359,7 +364,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Prepare any non-pointer (i.e. 
plain-old-data) KernelParamsMetal data */ /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */ size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) + - sizeof(IntegratorStateGPU); + offsetof(IntegratorStateGPU, sort_partition_divisor); size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset; memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset, (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset, @@ -416,7 +421,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) + - sizeof(IntegratorStateGPU); + offsetof(IntegratorStateGPU, sort_partition_divisor); for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) { int pointer_index = int(offset / sizeof(device_ptr)); MetalDevice::MetalMem *mmem = *( diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index fd32d8a260f..a988d01d361 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -37,6 +37,7 @@ struct MetalInfo { static int get_apple_gpu_core_count(id<MTLDevice> device); static MetalGPUVendor get_device_vendor(id<MTLDevice> device); static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device); + static int optimal_sort_partition_elements(id<MTLDevice> device); static string get_device_name(id<MTLDevice> device); }; diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index a7a5b596b8f..65c67c400fe 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -72,6 +72,21 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device) return METAL_GPU_UNKNOWN; } +int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device) +{ + if (auto str = 
getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) { + return atoi(str); + } + + /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before + * sorting each partition by material. Partitioning into chunks of 65536 elements results in an + * overall render time speedup of up to 15%. */ + if (get_device_vendor(device) == METAL_GPU_APPLE) { + return 65536; + } + return 0; +} + vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() { static vector<id<MTLDevice>> usable_devices; diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index e7dcc29a2da..11c0d1bf8a0 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -2047,7 +2047,7 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) /* Update traversable handle (since it is different for each device on multi devices). */ KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + *(OptixTraversableHandle *)&data->device_bvh = tlas_handle; update_launch_params(offsetof(KernelParamsOptiX, data), host, size); return; diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 14a5db3a204..808431af401 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -105,6 +105,13 @@ class DeviceQueue { * value. */ virtual int num_concurrent_busy_states() const = 0; + /* Number of elements in a partition of sorted shaders, that improves memory locality of + * integrator state fetch at the cost of decreased coherence for shader kernel execution. */ + virtual int num_sort_partition_elements() const + { + return 65536; + } + /* Initialize execution of kernels on this queue. * * Will, for example, load all data required by the kernels from Device to global or path state. 
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 6912bf928cd..ed278821b46 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -373,7 +373,7 @@ void PathTrace::path_trace(RenderWork &render_work) work_balance_infos_[i].time_spent += work_time; work_balance_infos_[i].occupancy = statistics.occupancy; - VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds (" + VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds (" << work_time / num_samples << " seconds per sample), occupancy: " << statistics.occupancy; }); diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index e262c252ce3..fa313f6460a 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -181,27 +181,45 @@ void PathTraceWorkGPU::alloc_integrator_queue() void PathTraceWorkGPU::alloc_integrator_sorting() { + /* Compute sort partitions, to balance between memory locality and coherence. + * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a + * more sophisticated heuristic we simply disable sort partitioning if the shader count is high. + */ + num_sort_partitions_ = 1; + if (device_scene_->data.max_shaders < 300) { + const int num_elements = queue_->num_sort_partition_elements(); + if (num_elements) { + num_sort_partitions_ = max(max_num_paths_ / num_elements, 1); + } + } + + integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_, + num_sort_partitions_); + /* Allocate arrays for shader sorting. 
*/ - const int max_shaders = device_scene_->data.max_shaders; - if (integrator_shader_sort_counter_.size() < max_shaders) { - integrator_shader_sort_counter_.alloc(max_shaders); + const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_; + if (integrator_shader_sort_counter_.size() < sort_buckets) { + integrator_shader_sort_counter_.alloc(sort_buckets); integrator_shader_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = + (int *)integrator_shader_sort_counter_.device_pointer; - integrator_shader_raytrace_sort_counter_.alloc(max_shaders); - integrator_shader_raytrace_sort_counter_.zero_to_device(); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + integrator_shader_raytrace_sort_counter_.alloc(sort_buckets); + integrator_shader_raytrace_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = + (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + } - integrator_shader_mnee_sort_counter_.alloc(max_shaders); - integrator_shader_mnee_sort_counter_.zero_to_device(); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) { + integrator_shader_mnee_sort_counter_.alloc(sort_buckets); + integrator_shader_mnee_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = + (int *)integrator_shader_mnee_sort_counter_.device_pointer; + } - integrator_shader_sort_prefix_sum_.alloc(max_shaders); + integrator_shader_sort_prefix_sum_.alloc(sort_buckets); integrator_shader_sort_prefix_sum_.zero_to_device(); - - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = - (int *)integrator_shader_sort_counter_.device_pointer; - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = - (int *)integrator_shader_raytrace_sort_counter_.device_pointer; - 
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = - (int *)integrator_shader_mnee_sort_counter_.device_pointer; } } @@ -333,8 +351,12 @@ void PathTraceWorkGPU::enqueue_reset() queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args); queue_->zero_to_device(integrator_queue_counter_); queue_->zero_to_device(integrator_shader_sort_counter_); - queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); - queue_->zero_to_device(integrator_shader_mnee_sort_counter_); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) { + queue_->zero_to_device(integrator_shader_mnee_sort_counter_); + } /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the * counter on the host side because `zero_to_device()` is not doing it. */ @@ -486,9 +508,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, /* Compute prefix sum of number of active paths with each shader. */ { const int work_size = 1; - int max_shaders = device_scene_->data.max_shaders; + int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_; - DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders); + DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets); queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); } diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h index 4c10a221a30..a805258d1b5 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.h +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork { bool interop_use_checked_ = false; bool interop_use_ = false; + /* Number of partitions to sort state indices into prior to material sort. 
*/ + int num_sort_partitions_; + /* Maximum number of concurrent integrator states. */ int max_num_paths_; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index b893ff6ef24..21a78722c0d 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -79,6 +79,7 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS device/metal/compat.h device/metal/context_begin.h device/metal/context_end.h + device/metal/function_constants.h device/metal/globals.h ) @@ -154,6 +155,7 @@ set(SRC_KERNEL_SVM_HEADERS svm/math_util.h svm/mix.h svm/musgrave.h + svm/node_types_template.h svm/noise.h svm/noisetex.h svm/normal.h @@ -282,6 +284,7 @@ set(SRC_KERNEL_UTIL_HEADERS set(SRC_KERNEL_TYPES_HEADERS data_arrays.h + data_template.h tables.h types.h ) @@ -844,10 +847,9 @@ if(WITH_CYCLES_DEVICE_ONEAPI) else() list(APPEND sycl_compiler_flags -fPIC) - # avoid getting __FAST_MATH__ to be defined for the graphics compiler on CentOS 7 until the compile-time issue it triggers gets fixed. - if(WITH_CYCLES_ONEAPI_BINARIES) - list(APPEND sycl_compiler_flags -fhonor-nans) - endif() + # We avoid getting __FAST_MATH__ to be defined when building on CentOS 7 until the compilation crash + # it triggers at either AoT or JIT stages gets fixed. + list(APPEND sycl_compiler_flags -fhonor-nans) # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and # libpi_level_zero.so can be placed next to it and get found. diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index a1d0e307170..9972de86c47 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -172,11 +172,11 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg, ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT; } - optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, + optixTrace(scene_intersect_valid(ray) ? 
kernel_data.device_bvh : 0, ray->P, ray->D, - 0.0f, - ray->t, + ray->tmin, + ray->tmax, ray->time, ray_mask, ray_flags, @@ -203,28 +203,28 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg, #elif defined(__METALRT__) if (!scene_intersect_valid(ray)) { - isect->t = ray->t; + isect->t = ray->tmax; isect->type = PRIMITIVE_NONE; return false; } # if defined(__KERNEL_DEBUG__) if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) { - isect->t = ray->t; + isect->t = ray->tmax; isect->type = PRIMITIVE_NONE; kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer"); return false; } if (is_null_intersection_function_table(metal_ancillaries->ift_default)) { - isect->t = ray->t; + isect->t = ray->tmax; isect->type = PRIMITIVE_NONE; kernel_assert(!"Invalid ift_default"); return false; } # endif - metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t); + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); metalrt_intersector_type metalrt_intersect; if (!kernel_data.bvh.have_curves) { @@ -263,7 +263,7 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg, # endif if (intersection.type == intersection_type::none) { - isect->t = ray->t; + isect->t = ray->tmax; isect->type = PRIMITIVE_NONE; return false; @@ -295,14 +295,14 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.bvh.scene) { - isect->t = ray->t; + if (kernel_data.device_bvh) { + isect->t = ray->tmax; CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR); IntersectContext rtc_ctx(&ctx); RTCRayHit ray_hit; ctx.ray = ray; kernel_embree_setup_rayhit(*ray, ray_hit, visibility); - rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit); + rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit); if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID && ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) { kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect); @@ -357,11 +357,11 @@ 
ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, if (local_isect) { local_isect->num_hits = 0; /* Initialize hit count to zero. */ } - optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, + optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0, ray->P, ray->D, - 0.0f, - ray->t, + ray->tmin, + ray->tmax, ray->time, 0xFF, /* Need to always call into __anyhit__kernel_optix_local_hit. */ @@ -405,7 +405,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, } # endif - metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t); + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); metalrt_intersector_type metalrt_intersect; metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque); @@ -451,7 +451,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.bvh.scene) { + if (kernel_data.device_bvh) { const bool has_bvh = !(kernel_data_fetch(object_flag, local_object) & SD_OBJECT_TRANSFORM_APPLIED); CCLIntersectContext ctx( @@ -470,13 +470,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, /* If this object has its own BVH, use it. */ if (has_bvh) { - RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2); + RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2); if (geom) { float3 P = ray->P; float3 dir = ray->D; float3 idir = ray->D; Transform ob_itfm; - rtc_ray.tfar = ray->t * + rtc_ray.tfar = ray->tmax * bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm); /* bvh_instance_motion_push() returns the inverse transform but * it's not needed here. */ @@ -496,7 +496,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, } } else { - rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray); } /* rtcOccluded1 sets tfar to -inf if a hit was found. 
*/ @@ -539,11 +539,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, ray_mask = 0xFF; } - optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, + optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0, ray->P, ray->D, - 0.0f, - ray->t, + ray->tmin, + ray->tmax, ray->time, ray_mask, /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */ @@ -582,7 +582,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, } # endif - metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t); + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); metalrt_intersector_type metalrt_intersect; metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque); @@ -633,7 +633,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.bvh.scene) { + if (kernel_data.device_bvh) { CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); Intersection *isect_array = (Intersection *)state->shadow_isect; ctx.isect_s = isect_array; @@ -642,7 +642,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, IntersectContext rtc_ctx(&ctx); RTCRay rtc_ray; kernel_embree_setup_ray(*ray, rtc_ray, visibility); - rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray); *num_recorded_hits = ctx.num_recorded_hits; *throughput = ctx.throughput; @@ -698,11 +698,11 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg, ray_mask = 0xFF; } - optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, + optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0, ray->P, ray->D, - 0.0f, - ray->t, + ray->tmin, + ray->tmax, ray->time, ray_mask, /* Need to always call into __anyhit__kernel_optix_volume_test. 
*/ @@ -744,7 +744,7 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg, } # endif - metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t); + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); metalrt_intersector_type metalrt_intersect; metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque); @@ -825,7 +825,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg, } # ifdef __EMBREE__ - if (kernel_data.bvh.scene) { + if (kernel_data.device_bvh) { CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL); ctx.isect_s = isect; ctx.max_hits = max_hits; @@ -834,7 +834,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg, IntersectContext rtc_ctx(&ctx); RTCRay rtc_ray; kernel_embree_setup_ray(*ray, rtc_ray, visibility); - rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray); return ctx.num_hits; } # endif /* __EMBREE__ */ diff --git a/intern/cycles/kernel/bvh/embree.h b/intern/cycles/kernel/bvh/embree.h index 1c6b9bc1e62..fecbccac2f8 100644 --- a/intern/cycles/kernel/bvh/embree.h +++ b/intern/cycles/kernel/bvh/embree.h @@ -83,8 +83,8 @@ ccl_device_inline void kernel_embree_setup_ray(const Ray &ray, rtc_ray.dir_x = ray.D.x; rtc_ray.dir_y = ray.D.y; rtc_ray.dir_z = ray.D.z; - rtc_ray.tnear = 0.0f; - rtc_ray.tfar = ray.t; + rtc_ray.tnear = ray.tmin; + rtc_ray.tfar = ray.tmax; rtc_ray.time = ray.time; rtc_ray.mask = visibility; } @@ -107,7 +107,7 @@ ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg const int oID = hit->instID[0] / 2; if ((ray->self.object == oID) || (ray->self.light_object == oID)) { RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( - rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0])); + rtcGetGeometry(kernel_data.device_bvh, hit->instID[0])); const int pID = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, 
hit->geomID)); status = intersection_skip_self_shadow(ray->self, oID, pID); @@ -117,7 +117,7 @@ ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg const int oID = hit->geomID / 2; if ((ray->self.object == oID) || (ray->self.light_object == oID)) { const int pID = hit->primID + (intptr_t)rtcGetGeometryUserData( - rtcGetGeometry(kernel_data.bvh.scene, hit->geomID)); + rtcGetGeometry(kernel_data.device_bvh, hit->geomID)); status = intersection_skip_self_shadow(ray->self, oID, pID); } } @@ -133,14 +133,14 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg, isect->t = ray->tfar; if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) { RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( - rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0])); + rtcGetGeometry(kernel_data.device_bvh, hit->instID[0])); isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)); isect->object = hit->instID[0] / 2; } else { isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData( - rtcGetGeometry(kernel_data.bvh.scene, hit->geomID)); + rtcGetGeometry(kernel_data.device_bvh, hit->geomID)); isect->object = hit->geomID / 2; } @@ -166,7 +166,7 @@ ccl_device_inline void kernel_embree_convert_sss_hit( isect->v = hit->u; isect->t = ray->tfar; RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( - rtcGetGeometry(kernel_data.bvh.scene, object * 2)); + rtcGetGeometry(kernel_data.device_bvh, object * 2)); isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)); isect->object = object; diff --git a/intern/cycles/kernel/bvh/local.h b/intern/cycles/kernel/bvh/local.h index 3b6b30ea93d..017a241ef4a 100644 --- a/intern/cycles/kernel/bvh/local.h +++ b/intern/cycles/kernel/bvh/local.h @@ -47,8 +47,9 @@ ccl_device_inline float3 P = ray->P; float3 dir = bvh_clamp_direction(ray->D); float3 idir = bvh_inverse_direction(dir); + float tmin = ray->tmin; int object = 
OBJECT_NONE; - float isect_t = ray->t; + float isect_t = ray->tmax; if (local_isect != NULL) { local_isect->num_hits = 0; @@ -59,10 +60,13 @@ ccl_device_inline if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm); + const float t_world_to_instance = bvh_instance_motion_push( + kg, local_object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir); + const float t_world_to_instance = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir); #endif + isect_t *= t_world_to_instance; + tmin *= t_world_to_instance; object = local_object; } @@ -81,6 +85,7 @@ ccl_device_inline dir, #endif idir, + tmin, isect_t, node_addr, PATH_RAY_ALL_VISIBILITY, @@ -155,6 +160,7 @@ ccl_device_inline local_object, prim, prim_addr, + tmin, isect_t, lcg_state, max_hits)) { @@ -191,6 +197,7 @@ ccl_device_inline local_object, prim, prim_addr, + tmin, isect_t, lcg_state, max_hits)) { diff --git a/intern/cycles/kernel/bvh/nodes.h b/intern/cycles/kernel/bvh/nodes.h index c19dea9223b..e02841fad16 100644 --- a/intern/cycles/kernel/bvh/nodes.h +++ b/intern/cycles/kernel/bvh/nodes.h @@ -18,7 +18,8 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals kg ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg, const float3 P, const float3 idir, - const float t, + const float tmin, + const float tmax, const int node_addr, const uint visibility, float dist[2]) @@ -39,8 +40,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); - float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); + float c0min = max4(tmin, min(c0lox, 
c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(tmax, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -48,8 +49,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); - float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); + float c1min = max4(tmin, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(tmax, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); dist[0] = c0min; dist[1] = c1min; @@ -66,7 +67,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg, ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals kg, const float3 P, const float3 dir, - const float t, + const float tmin, + const float tmax, int node_addr, int child, float dist[2]) @@ -83,8 +85,8 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals kg, const float far_x = max(lower_xyz.x, upper_xyz.x); const float far_y = max(lower_xyz.y, upper_xyz.y); const float far_z = max(lower_xyz.z, upper_xyz.z); - const float tnear = max4(0.0f, near_x, near_y, near_z); - const float tfar = min4(t, far_x, far_y, far_z); + const float tnear = max4(tmin, near_x, near_y, near_z); + const float tfar = min4(tmax, far_x, far_y, far_z); *dist = tnear; return tnear <= tfar; } @@ -93,7 +95,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg, const float3 P, const float3 dir, const float3 idir, - const float t, + const float tmin, + const float tmax, const int node_addr, const uint visibility, float dist[2]) @@ -102,7 +105,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg, #ifdef __VISIBILITY_FLAG__ float4 
cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0); #endif - if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { + if (bvh_unaligned_node_intersect_child(kg, P, dir, tmin, tmax, node_addr, 0, &dist[0])) { #ifdef __VISIBILITY_FLAG__ if ((__float_as_uint(cnodes.x) & visibility)) #endif @@ -110,7 +113,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg, mask |= 1; } } - if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { + if (bvh_unaligned_node_intersect_child(kg, P, dir, tmin, tmax, node_addr, 1, &dist[1])) { #ifdef __VISIBILITY_FLAG__ if ((__float_as_uint(cnodes.y) & visibility)) #endif @@ -125,16 +128,17 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals kg, const float3 P, const float3 dir, const float3 idir, - const float t, + const float tmin, + const float tmax, const int node_addr, const uint visibility, float dist[2]) { float4 node = kernel_data_fetch(bvh_nodes, node_addr); if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist); + return bvh_unaligned_node_intersect(kg, P, dir, idir, tmin, tmax, node_addr, visibility, dist); } else { - return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist); + return bvh_aligned_node_intersect(kg, P, idir, tmin, tmax, node_addr, visibility, dist); } } diff --git a/intern/cycles/kernel/bvh/shadow_all.h b/intern/cycles/kernel/bvh/shadow_all.h index e86fe867eac..db3c91569aa 100644 --- a/intern/cycles/kernel/bvh/shadow_all.h +++ b/intern/cycles/kernel/bvh/shadow_all.h @@ -49,6 +49,7 @@ ccl_device_inline float3 P = ray->P; float3 dir = bvh_clamp_direction(ray->D); float3 idir = bvh_inverse_direction(dir); + float tmin = ray->tmin; int object = OBJECT_NONE; uint num_hits = 0; @@ -59,12 +60,12 @@ ccl_device_inline /* Max distance in world space. 
May be dynamically reduced when max number of * recorded hits is exceeded and we no longer need to find hits beyond the max * distance found. */ - float t_max_world = ray->t; + float t_max_world = ray->tmax; /* Current maximum distance to the intersection. * Is calculated as a ray length, transformed to an object space when entering * instance node. */ - float t_max_current = ray->t; + float t_max_current = ray->tmax; /* Conversion from world to local space for the current instance if any, 1.0 * otherwise. */ @@ -88,6 +89,7 @@ ccl_device_inline dir, #endif idir, + tmin, t_max_current, node_addr, visibility, @@ -156,8 +158,16 @@ ccl_device_inline switch (type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect( - kg, &isect, P, dir, t_max_current, visibility, prim_object, prim, prim_addr); + hit = triangle_intersect(kg, + &isect, + P, + dir, + tmin, + t_max_current, + visibility, + prim_object, + prim, + prim_addr); break; } #if BVH_FEATURE(BVH_MOTION) @@ -166,6 +176,7 @@ ccl_device_inline &isect, P, dir, + tmin, t_max_current, ray->time, visibility, @@ -189,8 +200,16 @@ ccl_device_inline } const int curve_type = kernel_data_fetch(prim_type, prim_addr); - hit = curve_intersect( - kg, &isect, P, dir, t_max_current, prim_object, prim, ray->time, curve_type); + hit = curve_intersect(kg, + &isect, + P, + dir, + tmin, + t_max_current, + prim_object, + prim, + ray->time, + curve_type); break; } @@ -207,8 +226,16 @@ ccl_device_inline } const int point_type = kernel_data_fetch(prim_type, prim_addr); - hit = point_intersect( - kg, &isect, P, dir, t_max_current, prim_object, prim, ray->time, point_type); + hit = point_intersect(kg, + &isect, + P, + dir, + tmin, + t_max_current, + prim_object, + prim, + ray->time, + point_type); break; } #endif /* BVH_FEATURE(BVH_POINTCLOUD) */ @@ -302,6 +329,7 @@ ccl_device_inline /* Convert intersection to object space. 
*/ t_max_current *= t_world_to_instance; + tmin *= t_world_to_instance; ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); @@ -323,7 +351,8 @@ ccl_device_inline #endif /* Restore world space ray length. */ - t_max_current = ray->t; + tmin = ray->tmin; + t_max_current = ray->tmax; object = OBJECT_NONE; t_world_to_instance = 1.0f; diff --git a/intern/cycles/kernel/bvh/traversal.h b/intern/cycles/kernel/bvh/traversal.h index 784fbf4fd11..0ff38bf02de 100644 --- a/intern/cycles/kernel/bvh/traversal.h +++ b/intern/cycles/kernel/bvh/traversal.h @@ -43,13 +43,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, float3 P = ray->P; float3 dir = bvh_clamp_direction(ray->D); float3 idir = bvh_inverse_direction(dir); + float tmin = ray->tmin; int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; #endif - isect->t = ray->t; + isect->t = ray->tmax; isect->u = 0.0f; isect->v = 0.0f; isect->prim = PRIM_NONE; @@ -71,6 +72,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, dir, #endif idir, + tmin, isect->t, node_addr, visibility, @@ -133,8 +135,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, switch (type & PRIMITIVE_ALL) { case PRIMITIVE_TRIANGLE: { - if (triangle_intersect( - kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr)) { + if (triangle_intersect(kg, + isect, + P, + dir, + tmin, + isect->t, + visibility, + prim_object, + prim, + prim_addr)) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; @@ -147,6 +157,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, isect, P, dir, + tmin, isect->t, ray->time, visibility, @@ -174,7 +185,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, const int curve_type = kernel_data_fetch(prim_type, prim_addr); const bool hit = curve_intersect( - kg, isect, P, dir, isect->t, prim_object, prim, ray->time, curve_type); + kg, isect, P, dir, 
tmin, isect->t, prim_object, prim, ray->time, curve_type); if (hit) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) @@ -195,7 +206,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, const int point_type = kernel_data_fetch(prim_type, prim_addr); const bool hit = point_intersect( - kg, isect, P, dir, isect->t, prim_object, prim, ray->time, point_type); + kg, isect, P, dir, tmin, isect->t, prim_object, prim, ray->time, point_type); if (hit) { /* shadow ray early termination */ if (visibility & PATH_RAY_SHADOW_OPAQUE) @@ -212,11 +223,15 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, object = kernel_data_fetch(prim_object, -prim_addr - 1); #if BVH_FEATURE(BVH_MOTION) - isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); + const float t_world_to_instance = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); + const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif + isect->t *= t_world_to_instance; + tmin *= t_world_to_instance; + ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; @@ -235,6 +250,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg, #else isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); #endif + tmin = ray->tmin; object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr]; diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h index 572e023db25..1795ae4c790 100644 --- a/intern/cycles/kernel/bvh/util.h +++ b/intern/cycles/kernel/bvh/util.h @@ -5,6 +5,19 @@ CCL_NAMESPACE_BEGIN +/* Offset intersection distance by the smallest possible amount, to skip + * intersections at this distance. 
This works in cases where the ray start + * position is unchanged and only tmin is updated, since for self + * intersection we'll be comparing against the exact same distances. */ +ccl_device_forceinline float intersection_t_offset(const float t) +{ + /* This is a simplified version of nextafterf(t, FLT_MAX), only dealing with + * non-negative and finite t. */ + kernel_assert(t >= 0.0f && isfinite_safe(t)); + const uint32_t bits = (t == 0.0f) ? 1 : __float_as_uint(t) + 1; + return __uint_as_float(bits); +} + #if defined(__KERNEL_CPU__) ccl_device int intersections_compare(const void *a, const void *b) { diff --git a/intern/cycles/kernel/bvh/volume.h b/intern/cycles/kernel/bvh/volume.h index 9715712a8f2..bd4e508ecac 100644 --- a/intern/cycles/kernel/bvh/volume.h +++ b/intern/cycles/kernel/bvh/volume.h @@ -46,13 +46,14 @@ ccl_device_inline float3 P = ray->P; float3 dir = bvh_clamp_direction(ray->D); float3 idir = bvh_inverse_direction(dir); + float tmin = ray->tmin; int object = OBJECT_NONE; #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; #endif - isect->t = ray->t; + isect->t = ray->tmax; isect->u = 0.0f; isect->v = 0.0f; isect->prim = PRIM_NONE; @@ -73,6 +74,7 @@ ccl_device_inline dir, #endif idir, + tmin, isect->t, node_addr, visibility, @@ -140,7 +142,7 @@ ccl_device_inline continue; } triangle_intersect( - kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr); + kg, isect, P, dir, tmin, isect->t, visibility, prim_object, prim, prim_addr); } break; } @@ -165,6 +167,7 @@ ccl_device_inline isect, P, dir, + tmin, isect->t, ray->time, visibility, @@ -186,11 +189,15 @@ ccl_device_inline int object_flag = kernel_data_fetch(object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { #if BVH_FEATURE(BVH_MOTION) - isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); + const float t_world_to_instance = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect->t *= bvh_instance_push(kg, object, 
ray, &P, &dir, &idir); + const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif + isect->t *= t_world_to_instance; + tmin *= t_world_to_instance; + ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; @@ -217,6 +224,8 @@ ccl_device_inline isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); #endif + tmin = ray->tmin; + object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr]; --stack_ptr; diff --git a/intern/cycles/kernel/bvh/volume_all.h b/intern/cycles/kernel/bvh/volume_all.h index d06ea8fe557..c6eeb07a14d 100644 --- a/intern/cycles/kernel/bvh/volume_all.h +++ b/intern/cycles/kernel/bvh/volume_all.h @@ -44,12 +44,12 @@ ccl_device_inline int node_addr = kernel_data.bvh.root; /* ray parameters in registers */ - const float tmax = ray->t; float3 P = ray->P; float3 dir = bvh_clamp_direction(ray->D); float3 idir = bvh_inverse_direction(dir); + float tmin = ray->tmin; int object = OBJECT_NONE; - float isect_t = tmax; + float isect_t = ray->tmax; #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; @@ -58,7 +58,7 @@ ccl_device_inline int num_hits_in_instance = 0; uint num_hits = 0; - isect_array->t = tmax; + isect_array->t = ray->tmax; /* traversal loop */ do { @@ -75,6 +75,7 @@ ccl_device_inline dir, #endif idir, + tmin, isect_t, node_addr, visibility, @@ -141,8 +142,16 @@ ccl_device_inline if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { continue; } - hit = triangle_intersect( - kg, isect_array, P, dir, isect_t, visibility, prim_object, prim, prim_addr); + hit = triangle_intersect(kg, + isect_array, + P, + dir, + tmin, + isect_t, + visibility, + prim_object, + prim, + prim_addr); if (hit) { /* Move on to next entry in intersections array. 
*/ isect_array++; @@ -189,6 +198,7 @@ ccl_device_inline isect_array, P, dir, + tmin, isect_t, ray->time, visibility, @@ -232,11 +242,15 @@ ccl_device_inline int object_flag = kernel_data_fetch(object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { #if BVH_FEATURE(BVH_MOTION) - isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm); + const float t_world_to_instance = bvh_instance_motion_push( + kg, object, ray, &P, &dir, &idir, &ob_itfm); #else - isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir); + const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir); #endif + isect_t *= t_world_to_instance; + tmin *= t_world_to_instance; + num_hits_in_instance = 0; isect_array->t = isect_t; @@ -280,7 +294,8 @@ ccl_device_inline #endif } - isect_t = tmax; + tmin = ray->tmin; + isect_t = ray->tmax; isect_array->t = isect_t; object = OBJECT_NONE; diff --git a/intern/cycles/kernel/camera/camera.h b/intern/cycles/kernel/camera/camera.h index 25960a94ddb..926ccf7b86f 100644 --- a/intern/cycles/kernel/camera/camera.h +++ b/intern/cycles/kernel/camera/camera.h @@ -165,9 +165,11 @@ ccl_device void camera_sample_perspective(KernelGlobals kg, float nearclip = kernel_data.cam.nearclip * z_inv; ray->P += nearclip * ray->D; ray->dP += nearclip * ray->dD; - ray->t = kernel_data.cam.cliplength * z_inv; + ray->tmin = 0.0f; + ray->tmax = kernel_data.cam.cliplength * z_inv; #else - ray->t = FLT_MAX; + ray->tmin = 0.0f; + ray->tmax = FLT_MAX; #endif } @@ -231,9 +233,11 @@ ccl_device void camera_sample_orthographic(KernelGlobals kg, #ifdef __CAMERA_CLIPPING__ /* clipping */ - ray->t = kernel_data.cam.cliplength; + ray->tmin = 0.0f; + ray->tmax = kernel_data.cam.cliplength; #else - ray->t = FLT_MAX; + ray->tmin = 0.0f; + ray->tmax = FLT_MAX; #endif } @@ -258,7 +262,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, /* indicates ray should not receive any light, outside of the lens */ if 
(is_zero(D)) { - ray->t = 0.0f; + ray->tmax = 0.0f; return; } @@ -349,9 +353,11 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, float nearclip = cam->nearclip; ray->P += nearclip * ray->D; ray->dP += nearclip * ray->dD; - ray->t = cam->cliplength; + ray->tmin = 0.0f; + ray->tmax = cam->cliplength; #else - ray->t = FLT_MAX; + ray->tmin = 0.0f; + ray->tmax = FLT_MAX; #endif } @@ -368,7 +374,7 @@ ccl_device_inline void camera_sample(KernelGlobals kg, ccl_private Ray *ray) { /* pixel filter */ - int filter_table_offset = kernel_data.film.filter_table_offset; + int filter_table_offset = kernel_data.tables.filter_table_offset; float raster_x = x + lookup_table_read(kg, filter_u, filter_table_offset, FILTER_TABLE_SIZE); float raster_y = y + lookup_table_read(kg, filter_v, filter_table_offset, FILTER_TABLE_SIZE); diff --git a/intern/cycles/kernel/data_template.h b/intern/cycles/kernel/data_template.h new file mode 100644 index 00000000000..b06ac62a5d8 --- /dev/null +++ b/intern/cycles/kernel/data_template.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#ifndef KERNEL_STRUCT_BEGIN +# define KERNEL_STRUCT_BEGIN(name, parent) +#endif +#ifndef KERNEL_STRUCT_END +# define KERNEL_STRUCT_END(name) +#endif +#ifndef KERNEL_STRUCT_MEMBER +# define KERNEL_STRUCT_MEMBER(parent, type, name) +#endif + +/* Background. */ + +KERNEL_STRUCT_BEGIN(KernelBackground, background) +/* xyz store direction, w the angle. float4 instead of float3 is used + * to ensure consistent padding/alignment across devices. */ +KERNEL_STRUCT_MEMBER(background, float4, sun) +/* Only shader index. */ +KERNEL_STRUCT_MEMBER(background, int, surface_shader) +KERNEL_STRUCT_MEMBER(background, int, volume_shader) +KERNEL_STRUCT_MEMBER(background, float, volume_step_size) +KERNEL_STRUCT_MEMBER(background, int, transparent) +KERNEL_STRUCT_MEMBER(background, float, transparent_roughness_squared_threshold) +/* Portal sampling. 
*/ +KERNEL_STRUCT_MEMBER(background, float, portal_weight) +KERNEL_STRUCT_MEMBER(background, int, num_portals) +KERNEL_STRUCT_MEMBER(background, int, portal_offset) +/* Sun sampling. */ +KERNEL_STRUCT_MEMBER(background, float, sun_weight) +/* Importance map sampling. */ +KERNEL_STRUCT_MEMBER(background, float, map_weight) +KERNEL_STRUCT_MEMBER(background, int, map_res_x) +KERNEL_STRUCT_MEMBER(background, int, map_res_y) +/* Multiple importance sampling. */ +KERNEL_STRUCT_MEMBER(background, int, use_mis) +/* Lightgroup. */ +KERNEL_STRUCT_MEMBER(background, int, lightgroup) +/* Padding. */ +KERNEL_STRUCT_MEMBER(background, int, pad1) +KERNEL_STRUCT_MEMBER(background, int, pad2) +KERNEL_STRUCT_MEMBER(background, int, pad3) +KERNEL_STRUCT_END(KernelBackground) + +/* BVH: own BVH2 if no native device acceleration struct used. */ + +KERNEL_STRUCT_BEGIN(KernelBVH, bvh) +KERNEL_STRUCT_MEMBER(bvh, int, root) +KERNEL_STRUCT_MEMBER(bvh, int, have_motion) +KERNEL_STRUCT_MEMBER(bvh, int, have_curves) +KERNEL_STRUCT_MEMBER(bvh, int, bvh_layout) +KERNEL_STRUCT_MEMBER(bvh, int, use_bvh_steps) +KERNEL_STRUCT_MEMBER(bvh, int, curve_subdivisions) +KERNEL_STRUCT_MEMBER(bvh, int, pad1) +KERNEL_STRUCT_MEMBER(bvh, int, pad2) +KERNEL_STRUCT_END(KernelBVH) + +/* Film. */ + +KERNEL_STRUCT_BEGIN(KernelFilm, film) +/* XYZ to rendering color space transform. float4 instead of float3 to + * ensure consistent padding/alignment across devices. */ +KERNEL_STRUCT_MEMBER(film, float4, xyz_to_r) +KERNEL_STRUCT_MEMBER(film, float4, xyz_to_g) +KERNEL_STRUCT_MEMBER(film, float4, xyz_to_b) +KERNEL_STRUCT_MEMBER(film, float4, rgb_to_y) +/* Rec709 to rendering color space. */ +KERNEL_STRUCT_MEMBER(film, float4, rec709_to_r) +KERNEL_STRUCT_MEMBER(film, float4, rec709_to_g) +KERNEL_STRUCT_MEMBER(film, float4, rec709_to_b) +KERNEL_STRUCT_MEMBER(film, int, is_rec709) +/* Exposuse. */ +KERNEL_STRUCT_MEMBER(film, float, exposure) +/* Passed used. 
*/ +KERNEL_STRUCT_MEMBER(film, int, pass_flag) +KERNEL_STRUCT_MEMBER(film, int, light_pass_flag) +/* Pass offsets. */ +KERNEL_STRUCT_MEMBER(film, int, pass_stride) +KERNEL_STRUCT_MEMBER(film, int, pass_combined) +KERNEL_STRUCT_MEMBER(film, int, pass_depth) +KERNEL_STRUCT_MEMBER(film, int, pass_position) +KERNEL_STRUCT_MEMBER(film, int, pass_normal) +KERNEL_STRUCT_MEMBER(film, int, pass_roughness) +KERNEL_STRUCT_MEMBER(film, int, pass_motion) +KERNEL_STRUCT_MEMBER(film, int, pass_motion_weight) +KERNEL_STRUCT_MEMBER(film, int, pass_uv) +KERNEL_STRUCT_MEMBER(film, int, pass_object_id) +KERNEL_STRUCT_MEMBER(film, int, pass_material_id) +KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_color) +KERNEL_STRUCT_MEMBER(film, int, pass_glossy_color) +KERNEL_STRUCT_MEMBER(film, int, pass_transmission_color) +KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_indirect) +KERNEL_STRUCT_MEMBER(film, int, pass_glossy_indirect) +KERNEL_STRUCT_MEMBER(film, int, pass_transmission_indirect) +KERNEL_STRUCT_MEMBER(film, int, pass_volume_indirect) +KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_direct) +KERNEL_STRUCT_MEMBER(film, int, pass_glossy_direct) +KERNEL_STRUCT_MEMBER(film, int, pass_transmission_direct) +KERNEL_STRUCT_MEMBER(film, int, pass_volume_direct) +KERNEL_STRUCT_MEMBER(film, int, pass_emission) +KERNEL_STRUCT_MEMBER(film, int, pass_background) +KERNEL_STRUCT_MEMBER(film, int, pass_ao) +KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold) +KERNEL_STRUCT_MEMBER(film, int, pass_shadow) +KERNEL_STRUCT_MEMBER(film, float, pass_shadow_scale) +KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher) +KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count) +KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte) +/* Cryptomatte. */ +KERNEL_STRUCT_MEMBER(film, int, cryptomatte_passes) +KERNEL_STRUCT_MEMBER(film, int, cryptomatte_depth) +KERNEL_STRUCT_MEMBER(film, int, pass_cryptomatte) +/* Adaptive sampling. 
*/ +KERNEL_STRUCT_MEMBER(film, int, pass_adaptive_aux_buffer) +KERNEL_STRUCT_MEMBER(film, int, pass_sample_count) +/* Mist. */ +KERNEL_STRUCT_MEMBER(film, int, pass_mist) +KERNEL_STRUCT_MEMBER(film, float, mist_start) +KERNEL_STRUCT_MEMBER(film, float, mist_inv_depth) +KERNEL_STRUCT_MEMBER(film, float, mist_falloff) +/* Denoising. */ +KERNEL_STRUCT_MEMBER(film, int, pass_denoising_normal) +KERNEL_STRUCT_MEMBER(film, int, pass_denoising_albedo) +KERNEL_STRUCT_MEMBER(film, int, pass_denoising_depth) +/* AOVs. */ +KERNEL_STRUCT_MEMBER(film, int, pass_aov_color) +KERNEL_STRUCT_MEMBER(film, int, pass_aov_value) +/* Light groups. */ +KERNEL_STRUCT_MEMBER(film, int, pass_lightgroup) +/* Baking. */ +KERNEL_STRUCT_MEMBER(film, int, pass_bake_primitive) +KERNEL_STRUCT_MEMBER(film, int, pass_bake_differential) +/* Shadow catcher. */ +KERNEL_STRUCT_MEMBER(film, int, use_approximate_shadow_catcher) +/* Padding. */ +KERNEL_STRUCT_MEMBER(film, int, pad1) +KERNEL_STRUCT_MEMBER(film, int, pad2) +KERNEL_STRUCT_END(KernelFilm) + +/* Integrator. */ + +KERNEL_STRUCT_BEGIN(KernelIntegrator, integrator) +/* Emission. */ +KERNEL_STRUCT_MEMBER(integrator, int, use_direct_light) +KERNEL_STRUCT_MEMBER(integrator, int, num_distribution) +KERNEL_STRUCT_MEMBER(integrator, int, num_all_lights) +KERNEL_STRUCT_MEMBER(integrator, float, pdf_triangles) +KERNEL_STRUCT_MEMBER(integrator, float, pdf_lights) +KERNEL_STRUCT_MEMBER(integrator, float, light_inv_rr_threshold) +/* Bounces. */ +KERNEL_STRUCT_MEMBER(integrator, int, min_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, max_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, max_diffuse_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, max_glossy_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, max_transmission_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, max_volume_bounce) +/* AO bounces. 
*/ +KERNEL_STRUCT_MEMBER(integrator, int, ao_bounces) +KERNEL_STRUCT_MEMBER(integrator, float, ao_bounces_distance) +KERNEL_STRUCT_MEMBER(integrator, float, ao_bounces_factor) +KERNEL_STRUCT_MEMBER(integrator, float, ao_additive_factor) +/* Transparency. */ +KERNEL_STRUCT_MEMBER(integrator, int, transparent_min_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, transparent_max_bounce) +KERNEL_STRUCT_MEMBER(integrator, int, transparent_shadows) +/* Caustics. */ +KERNEL_STRUCT_MEMBER(integrator, int, caustics_reflective) +KERNEL_STRUCT_MEMBER(integrator, int, caustics_refractive) +KERNEL_STRUCT_MEMBER(integrator, float, filter_glossy) +/* Seed. */ +KERNEL_STRUCT_MEMBER(integrator, int, seed) +/* Clamp. */ +KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_direct) +KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_indirect) +/* MIS. */ +KERNEL_STRUCT_MEMBER(integrator, int, use_lamp_mis) +/* Caustics. */ +KERNEL_STRUCT_MEMBER(integrator, int, use_caustics) +/* Sampling pattern. */ +KERNEL_STRUCT_MEMBER(integrator, int, sampling_pattern) +KERNEL_STRUCT_MEMBER(integrator, float, scrambling_distance) +/* Volume render. */ +KERNEL_STRUCT_MEMBER(integrator, int, use_volumes) +KERNEL_STRUCT_MEMBER(integrator, int, volume_max_steps) +KERNEL_STRUCT_MEMBER(integrator, float, volume_step_rate) +/* Shadow catcher. */ +KERNEL_STRUCT_MEMBER(integrator, int, has_shadow_catcher) +/* Closure filter. */ +KERNEL_STRUCT_MEMBER(integrator, int, filter_closures) +/* MIS debugging. */ +KERNEL_STRUCT_MEMBER(integrator, int, direct_light_sampling_type) +/* Padding */ +KERNEL_STRUCT_MEMBER(integrator, int, pad1) +KERNEL_STRUCT_END(KernelIntegrator) + +/* SVM. For shader specialization. 
*/ + +KERNEL_STRUCT_BEGIN(KernelSVMUsage, svm_usage) +#define SHADER_NODE_TYPE(type) KERNEL_STRUCT_MEMBER(svm_usage, int, type) +#include "kernel/svm/node_types_template.h" +KERNEL_STRUCT_END(KernelSVMUsage) + +#undef KERNEL_STRUCT_BEGIN +#undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_END diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index b9a44ccad02..e1ab802aa80 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -246,7 +246,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel_postfix #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__) -constant int __dummy_constant [[function_constant(0)]]; +constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]]; #endif ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) diff --git a/intern/cycles/kernel/device/metal/function_constants.h b/intern/cycles/kernel/device/metal/function_constants.h new file mode 100644 index 00000000000..3adf390c7f6 --- /dev/null +++ b/intern/cycles/kernel/device/metal/function_constants.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Blender Foundation */ + +enum { + Kernel_DummyConstant, +#define KERNEL_STRUCT_MEMBER(parent, type, name) KernelData_##parent##_##name, +#include "kernel/data_template.h" +}; + +#ifdef __KERNEL_METAL__ +# define KERNEL_STRUCT_MEMBER(parent, type, name) \ + constant type kernel_data_##parent##_##name \ + [[function_constant(KernelData_##parent##_##name)]]; +# include "kernel/data_template.h" +#endif diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal index 3c31dc3354c..764c26dbe8f 100644 --- a/intern/cycles/kernel/device/metal/kernel.metal +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -5,6 +5,7 @@ #include "kernel/device/metal/compat.h" #include "kernel/device/metal/globals.h" 
+#include "kernel/device/metal/function_constants.h" #include "kernel/device/gpu/kernel.h" /* MetalRT intersection handlers */ @@ -409,6 +410,7 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, const float3 ray_origin, const float3 ray_direction, float time, + const float ray_tmin, const float ray_tmax, thread BoundingBoxIntersectionResult &result) { @@ -433,7 +435,7 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( launch_params_metal, payload, object, prim, isect.u); if (result.accept) { @@ -455,6 +457,7 @@ void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params const float3 ray_origin, const float3 ray_direction, float time, + const float ray_tmin, const float ray_tmax, thread BoundingBoxIntersectionResult &result) { @@ -474,7 +477,7 @@ void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); result.accept = !result.continue_search; @@ -493,6 +496,7 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b const uint primitive_id [[primitive_id]], const float3 ray_origin [[origin]], const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], const float 
ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -510,7 +514,7 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b # else 0.0f, # endif - ray_tmax, result); + ray_tmin, ray_tmax, result); } return result; @@ -524,6 +528,7 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me const uint primitive_id [[primitive_id]], const float3 ray_origin [[origin]], const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -541,7 +546,7 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me # else 0.0f, # endif - ray_tmax, result); + ray_tmin, ray_tmax, result); } return result; @@ -555,6 +560,7 @@ __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buff const uint primitive_id [[primitive_id]], const float3 ray_origin [[origin]], const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -570,7 +576,7 @@ __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buff # else 0.0f, # endif - ray_tmax, result); + ray_tmin, ray_tmax, result); return result; } @@ -583,6 +589,7 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal const uint primitive_id [[primitive_id]], const float3 ray_origin [[origin]], const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -599,7 +606,7 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal # else 0.0f, # endif - ray_tmax, result); + ray_tmin, ray_tmax, result); return result; } 
@@ -615,6 +622,7 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, const float3 ray_origin, const float3 ray_direction, float time, + const float ray_tmin, const float ray_tmax, thread BoundingBoxIntersectionResult &result) { @@ -639,7 +647,7 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( launch_params_metal, payload, object, prim, isect.u); if (result.accept) { @@ -661,6 +669,7 @@ void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params const float3 ray_origin, const float3 ray_direction, float time, + const float ray_tmin, const float ray_tmax, thread BoundingBoxIntersectionResult &result) { @@ -680,7 +689,7 @@ void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); result.accept = !result.continue_search; @@ -699,6 +708,7 @@ __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1 const uint primitive_id [[primitive_id]], const float3 ray_origin [[origin]], const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -715,7 +725,7 @@ 
__intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1 # else 0.0f, # endif - ray_tmax, result); + ray_tmin, ray_tmax, result); return result; } @@ -728,6 +738,7 @@ __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[b const uint primitive_id [[primitive_id]], const float3 ray_origin [[origin]], const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -744,7 +755,7 @@ __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[b # else 0.0f, # endif - ray_tmax, result); + ray_tmin, ray_tmax, result); return result; } diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 82910d72105..300e201600c 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -670,10 +670,7 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, } static const int lowest_supported_driver_version_win = 1011660; -/* TODO: once Linux JIT compilation crash from CentOS generated spv is fixed, adjust version below. - * Until then, set CYCLES_ONEAPI_ALL_DEVICES environment variable to avoid getting it filtered out. 
- */ -static const int lowest_supported_driver_version_neo = 28000; +static const int lowest_supported_driver_version_neo = 23570; static int parse_driver_build_version(const sycl::device &device) { diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 949bf41d171..510f7cca5d6 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -51,32 +51,36 @@ ccl_device_forceinline int get_object_id() extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() { const int global_index = optixGetLaunchIndex().x; - const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] : - global_index; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; integrator_intersect_closest(nullptr, path_index, kernel_params.render_buffer); } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow() { const int global_index = optixGetLaunchIndex().x; - const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] : - global_index; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; integrator_intersect_shadow(nullptr, path_index); } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface() { const int global_index = optixGetLaunchIndex().x; - const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] : - global_index; + const int path_index = (kernel_params.path_index_array) ? 
+ kernel_params.path_index_array[global_index] : + global_index; integrator_intersect_subsurface(nullptr, path_index); } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack() { const int global_index = optixGetLaunchIndex().x; - const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] : - global_index; + const int path_index = (kernel_params.path_index_array) ? + kernel_params.path_index_array[global_index] : + global_index; integrator_intersect_volume_stack(nullptr, path_index); } @@ -408,6 +412,7 @@ ccl_device_inline void optix_intersection_curve(const int prim, const int type) float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); + float tmin = optixGetRayTmin(); /* The direction is not normalized by default, but the curve intersection routine expects that */ float len; @@ -425,7 +430,7 @@ ccl_device_inline void optix_intersection_curve(const int prim, const int type) if (isect.t != FLT_MAX) isect.t *= len; - if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) { static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, @@ -462,6 +467,7 @@ extern "C" __global__ void __intersection__point() float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); + float tmin = optixGetRayTmin(); /* The direction is not normalized by default, the point intersection routine expects that. 
*/ float len; @@ -480,7 +486,7 @@ extern "C" __global__ void __intersection__point() isect.t *= len; } - if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + if (point_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) { static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL); } diff --git a/intern/cycles/kernel/geom/curve_intersect.h b/intern/cycles/kernel/geom/curve_intersect.h index 001bec01749..9770105dd81 100644 --- a/intern/cycles/kernel/geom/curve_intersect.h +++ b/intern/cycles/kernel/geom/curve_intersect.h @@ -156,7 +156,8 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co } ccl_device bool curve_intersect_iterative(const float3 ray_dir, - ccl_private float *ray_tfar, + const float ray_tmin, + ccl_private float *ray_tmax, const float dt, const float4 curve[4], float u, @@ -220,7 +221,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, if (fabsf(f) < f_err && fabsf(g) < g_err) { t += dt; - if (!(0.0f <= t && t <= *ray_tfar)) { + if (!(t >= ray_tmin && t <= *ray_tmax)) { return false; /* Rejects NaNs */ } if (!(u >= 0.0f && u <= 1.0f)) { @@ -237,7 +238,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, } /* Record intersection. */ - *ray_tfar = t; + *ray_tmax = t; isect->t = t; isect->u = u; isect->v = 0.0f; @@ -250,7 +251,8 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir, ccl_device bool curve_intersect_recursive(const float3 ray_orig, const float3 ray_dir, - float ray_tfar, + const float ray_tmin, + float ray_tmax, float4 curve[4], ccl_private Intersection *isect) { @@ -331,7 +333,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig, } /* Intersect with cap-planes. 
*/ - float2 tp = make_float2(-dt, ray_tfar - dt); + float2 tp = make_float2(ray_tmin - dt, ray_tmax - dt); tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y)); const float2 h0 = half_plane_intersect( float4_to_float3(P0), float4_to_float3(dP0du), ray_dir); @@ -394,19 +396,20 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig, CURVE_NUM_BEZIER_SUBDIVISIONS; if (depth >= termDepth) { found |= curve_intersect_iterative( - ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect); + ray_dir, ray_tmin, &ray_tmax, dt, curve, u_outer0, tp0.x, use_backfacing, isect); } else { recurse = true; } } - if (valid1 && (tp1.x + dt <= ray_tfar)) { + const float t1 = tp1.x + dt; + if (valid1 && (t1 >= ray_tmin && t1 <= ray_tmax)) { const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE : CURVE_NUM_BEZIER_SUBDIVISIONS; if (depth >= termDepth) { found |= curve_intersect_iterative( - ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect); + ray_dir, ray_tmin, &ray_tmax, dt, curve, u_outer1, tp1.y, use_backfacing, isect); } else { recurse = true; @@ -456,7 +459,8 @@ ccl_device_inline bool cylinder_culling_test(const float2 p1, const float2 p2, c * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two * triangles gets intersected. */ -ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar, +ccl_device_inline bool ribbon_intersect_quad(const float ray_tmin, + const float ray_tmax, const float3 quad_v0, const float3 quad_v1, const float3 quad_v2, @@ -497,7 +501,7 @@ ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar, /* Perform depth test? 
*/ const float t = rcpDen * dot(v0, Ng); - if (!(0.0f <= t && t <= ray_tfar)) { + if (!(t >= ray_tmin && t <= ray_tmax)) { return false; } @@ -534,7 +538,8 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3], ccl_device_inline bool ribbon_intersect(const float3 ray_org, const float3 ray_dir, - float ray_tfar, + const float ray_tmin, + float ray_tmax, const int N, float4 curve[4], ccl_private Intersection *isect) @@ -582,7 +587,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, /* Intersect quad. */ float vu, vv, vt; - bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt); + bool valid0 = ribbon_intersect_quad(ray_tmin, ray_tmax, lp0, lp1, up1, up0, &vu, &vv, &vt); if (valid0) { /* ignore self intersections */ @@ -596,7 +601,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org, vv = 2.0f * vv - 1.0f; /* Record intersection. */ - ray_tfar = vt; + ray_tmax = vt; isect->t = vt; isect->u = u + vu * step_size; isect->v = vv; @@ -616,6 +621,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg, ccl_private Intersection *isect, const float3 P, const float3 dir, + const float tmin, const float tmax, int object, int prim, @@ -645,7 +651,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg, if (type & PRIMITIVE_CURVE_RIBBON) { /* todo: adaptive number of subdivisions could help performance here. 
*/ const int subdivisions = kernel_data.bvh.curve_subdivisions; - if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) { + if (ribbon_intersect(P, dir, tmin, tmax, subdivisions, curve, isect)) { isect->prim = prim; isect->object = object; isect->type = type; @@ -655,7 +661,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg, return false; } else { - if (curve_intersect_recursive(P, dir, tmax, curve, isect)) { + if (curve_intersect_recursive(P, dir, tmin, tmax, curve, isect)) { isect->prim = prim; isect->object = object; isect->type = type; diff --git a/intern/cycles/kernel/geom/motion_triangle_intersect.h b/intern/cycles/kernel/geom/motion_triangle_intersect.h index 6eea5096567..b59c5c43c20 100644 --- a/intern/cycles/kernel/geom/motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h @@ -46,6 +46,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg, ccl_private Intersection *isect, float3 P, float3 dir, + float tmin, float tmax, float time, uint visibility, @@ -58,7 +59,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg, motion_triangle_vertices(kg, object, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) { + if (ray_triangle_intersect(P, dir, tmin, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption * that most triangles are culled by node flags. @@ -92,6 +93,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg, int object, int prim, int prim_addr, + float tmin, float tmax, ccl_private uint *lcg_state, int max_hits) @@ -101,7 +103,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg, motion_triangle_vertices(kg, object, prim, time, verts); /* Ray-triangle intersection, unoptimized. 
*/ float t, u, v; - if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) { + if (!ray_triangle_intersect(P, dir, tmin, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) { return false; } diff --git a/intern/cycles/kernel/geom/point_intersect.h b/intern/cycles/kernel/geom/point_intersect.h index dfd9d9a015b..ee5a564947b 100644 --- a/intern/cycles/kernel/geom/point_intersect.h +++ b/intern/cycles/kernel/geom/point_intersect.h @@ -9,8 +9,12 @@ CCL_NAMESPACE_BEGIN #ifdef __POINTCLOUD__ -ccl_device_forceinline bool point_intersect_test( - const float4 point, const float3 P, const float3 dir, const float tmax, ccl_private float *t) +ccl_device_forceinline bool point_intersect_test(const float4 point, + const float3 P, + const float3 dir, + const float tmin, + const float tmax, + ccl_private float *t) { const float3 center = float4_to_float3(point); const float radius = point.w; @@ -28,12 +32,12 @@ ccl_device_forceinline bool point_intersect_test( const float td = sqrt((r2 - l2) * rd2); const float t_front = projC0 - td; - const bool valid_front = (0.0f <= t_front) & (t_front <= tmax); + const bool valid_front = (tmin <= t_front) & (t_front <= tmax); /* Always back-face culling for now. */ # if 0 const float t_back = projC0 + td; - const bool valid_back = (0.0f <= t_back) & (t_back <= tmax); + const bool valid_back = (tmin <= t_back) & (t_back <= tmax); /* check if there is a first hit */ const bool valid_first = valid_front | valid_back; @@ -56,6 +60,7 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg, ccl_private Intersection *isect, const float3 P, const float3 dir, + const float tmin, const float tmax, const int object, const int prim, @@ -65,7 +70,7 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg, const float4 point = (type & PRIMITIVE_MOTION) ? 
motion_point(kg, object, prim, time) : kernel_data_fetch(points, prim); - if (!point_intersect_test(point, P, dir, tmax, &isect->t)) { + if (!point_intersect_test(point, P, dir, tmin, tmax, &isect->t)) { return false; } diff --git a/intern/cycles/kernel/geom/shader_data.h b/intern/cycles/kernel/geom/shader_data.h index e5dbeac5e66..99b9289cb4a 100644 --- a/intern/cycles/kernel/geom/shader_data.h +++ b/intern/cycles/kernel/geom/shader_data.h @@ -407,7 +407,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals kg, { /* vectors */ - sd->P = ray->P; + sd->P = ray->P + ray->D * ray->tmin; sd->N = -ray->D; sd->Ng = -ray->D; sd->I = -ray->D; @@ -441,7 +441,6 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals kg, /* for NDC coordinates */ sd->ray_P = ray->P; - sd->ray_dP = ray->dP; } #endif /* __VOLUME__ */ diff --git a/intern/cycles/kernel/geom/triangle_intersect.h b/intern/cycles/kernel/geom/triangle_intersect.h index 0c76de9ccc7..f968e537cfa 100644 --- a/intern/cycles/kernel/geom/triangle_intersect.h +++ b/intern/cycles/kernel/geom/triangle_intersect.h @@ -17,6 +17,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg, ccl_private Intersection *isect, float3 P, float3 dir, + float tmin, float tmax, uint visibility, int object, @@ -28,7 +29,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg, tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1), tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2); float t, u, v; - if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) { + if (ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption * that most triangles are culled by node flags. 
@@ -62,6 +63,7 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg, int object, int prim, int prim_addr, + float tmin, float tmax, ccl_private uint *lcg_state, int max_hits) @@ -71,7 +73,7 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg, tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1), tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2); float t, u, v; - if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) { + if (!ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) { return false; } diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h index c63684d58e6..bf3f41b52b9 100644 --- a/intern/cycles/kernel/integrator/init_from_bake.h +++ b/intern/cycles/kernel/integrator/init_from_bake.h @@ -174,14 +174,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, Ray ray ccl_optional_struct_init; ray.P = zero_float3(); ray.D = normalize(P); - ray.t = FLT_MAX; + ray.tmin = 0.0f; + ray.tmax = FLT_MAX; ray.time = 0.5f; ray.dP = differential_zero_compact(); ray.dD = differential_zero_compact(); integrator_state_write_ray(kg, state, &ray); /* Setup next kernel to execute. */ - INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); } else { /* Surface baking. */ @@ -210,7 +211,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, Ray ray ccl_optional_struct_init; ray.P = P + N; ray.D = -N; - ray.t = FLT_MAX; + ray.tmin = 0.0f; + ray.tmax = FLT_MAX; ray.time = 0.5f; /* Setup differentials. 
*/ @@ -247,13 +249,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE); if (use_caustics) { - INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index); + integrator_path_init_sorted( + kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index); } else if (use_raytrace_kernel) { - INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index); + integrator_path_init_sorted( + kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index); } else { - INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index); + integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index); } } diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h index 9fe27cdda9a..e89ab3991c7 100644 --- a/intern/cycles/kernel/integrator/init_from_camera.h +++ b/intern/cycles/kernel/integrator/init_from_camera.h @@ -86,7 +86,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg, /* Generate camera ray. */ Ray ray; integrate_camera_sample(kg, sample, x, y, rng_hash, &ray); - if (ray.t == 0.0f) { + if (ray.tmax == 0.0f) { return true; } @@ -100,10 +100,10 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg, /* Continue with intersect_closest kernel, optionally initializing volume * stack before that if the camera may be inside a volume. 
*/ if (kernel_data.cam.is_inside_volume) { - INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); + integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); } else { - INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); } return true; diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h index 621aa05f46b..60299f2cb2f 100644 --- a/intern/cycles/kernel/integrator/intersect_closest.h +++ b/intern/cycles/kernel/integrator/intersect_closest.h @@ -109,14 +109,14 @@ ccl_device_forceinline void integrator_split_shadow_catcher( /* If using background pass, schedule background shading kernel so that we have a background * to alpha-over on. The background kernel will then continue the path afterwards. */ INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND; - INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); return; } if (!integrator_state_volume_stack_is_empty(kg, state)) { /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher * objects from it, and then continue shading volume and shadow catcher surface after. 
*/ - INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); + integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); return; } @@ -128,18 +128,19 @@ ccl_device_forceinline void integrator_split_shadow_catcher( const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); if (use_caustics) { - INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); } else if (use_raytrace_kernel) { - INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + integrator_path_init_sorted( + kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } else { - INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); } } /* Schedule next kernel to be executed after updating volume stack for shadow catcher. 
*/ -template<uint32_t current_kernel> +template<DeviceKernel current_kernel> ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume( KernelGlobals kg, IntegratorState state) { @@ -156,20 +157,21 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); if (use_caustics) { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); } else if (use_raytrace_kernel) { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } else { - INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); } } /* Schedule next kernel to be executed after executing background shader for shadow catcher. */ -template<uint32_t current_kernel> +template<DeviceKernel current_kernel> ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background( KernelGlobals kg, IntegratorState state) { @@ -177,7 +179,8 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche if (!integrator_state_volume_stack_is_empty(kg, state)) { /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher * objects from it, and then continue shading volume and shadow catcher surface after. 
*/ - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); + integrator_path_next( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); return; } @@ -190,7 +193,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche * * Note that current_kernel is a template value since making this a variable * leads to poor performance with CUDA atomics. */ -template<uint32_t current_kernel> +template<DeviceKernel current_kernel> ccl_device_forceinline void integrator_intersect_next_kernel( KernelGlobals kg, IntegratorState state, @@ -206,10 +209,10 @@ ccl_device_forceinline void integrator_intersect_next_kernel( const int flags = (hit_surface) ? kernel_data_fetch(shaders, shader).flags : 0; if (!integrator_intersect_terminate(kg, state, flags)) { - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); + integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); } else { - INTEGRATOR_PATH_TERMINATE(current_kernel); + integrator_path_terminate(kg, state, current_kernel); } return; } @@ -218,7 +221,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel( if (hit) { /* Hit a surface, continue with light or surface kernel. */ if (isect->type & PRIMITIVE_LAMP) { - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); } else { /* Hit a surface, continue with surface kernel unless terminated. 
*/ @@ -231,16 +234,16 @@ ccl_device_forceinline void integrator_intersect_next_kernel( (object_flags & SD_OBJECT_CAUSTICS); const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); if (use_caustics) { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); } else if (use_raytrace_kernel) { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } else { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); } #ifdef __SHADOW_CATCHER__ @@ -249,13 +252,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel( #endif } else { - INTEGRATOR_PATH_TERMINATE(current_kernel); + integrator_path_terminate(kg, state, current_kernel); } } } else { /* Nothing hit, continue with background kernel. */ - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); } } @@ -263,7 +266,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel( * * The logic here matches integrator_intersect_next_kernel, except that * volume shading and termination testing have already been done. */ -template<uint32_t current_kernel> +template<DeviceKernel current_kernel> ccl_device_forceinline void integrator_intersect_next_kernel_after_volume( KernelGlobals kg, IntegratorState state, @@ -273,7 +276,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume( if (isect->prim != PRIM_NONE) { /* Hit a surface, continue with light or surface kernel. 
*/ if (isect->type & PRIMITIVE_LAMP) { - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); return; } else { @@ -286,16 +289,16 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume( const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE); if (use_caustics) { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); } else if (use_raytrace_kernel) { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } else { - INTEGRATOR_PATH_NEXT_SORTED( - current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); + integrator_path_next_sorted( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); } #ifdef __SHADOW_CATCHER__ @@ -307,7 +310,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume( } else { /* Nothing hit, continue with background kernel. */ - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); return; } } @@ -321,7 +324,7 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg, /* Read ray from integrator state into local memory. 
*/ Ray ray ccl_optional_struct_init; integrator_state_read_ray(kg, state, &ray); - kernel_assert(ray.t != 0.0f); + kernel_assert(ray.tmax != 0.0f); const uint visibility = path_state_ray_visibility(state); const int last_isect_prim = INTEGRATOR_STATE(state, isect, prim); @@ -329,12 +332,12 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg, /* Trick to use short AO rays to approximate indirect light at the end of the path. */ if (path_state_ao_bounce(kg, state)) { - ray.t = kernel_data.integrator.ao_bounces_distance; + ray.tmax = kernel_data.integrator.ao_bounces_distance; if (last_isect_object != OBJECT_NONE) { const float object_ao_distance = kernel_data_fetch(objects, last_isect_object).ao_distance; if (object_ao_distance != 0.0f) { - ray.t = object_ao_distance; + ray.tmax = object_ao_distance; } } } diff --git a/intern/cycles/kernel/integrator/intersect_shadow.h b/intern/cycles/kernel/integrator/intersect_shadow.h index 3e746998225..1b48b360858 100644 --- a/intern/cycles/kernel/integrator/intersect_shadow.h +++ b/intern/cycles/kernel/integrator/intersect_shadow.h @@ -162,7 +162,7 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt if (opaque_hit) { /* Hit an opaque surface, shadow path ends here. */ - INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); + integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); return; } else { @@ -171,7 +171,9 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt * * TODO: could also write to render buffer directly if no transparent shadows? * Could save a kernel execution for the common case. 
*/ - INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, + integrator_shadow_path_next(kg, + state, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); return; } diff --git a/intern/cycles/kernel/integrator/intersect_subsurface.h b/intern/cycles/kernel/integrator/intersect_subsurface.h index 0a2c4ad680d..f439d6905a0 100644 --- a/intern/cycles/kernel/integrator/intersect_subsurface.h +++ b/intern/cycles/kernel/integrator/intersect_subsurface.h @@ -17,7 +17,7 @@ ccl_device void integrator_intersect_subsurface(KernelGlobals kg, IntegratorStat } #endif - INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); + integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/intersect_volume_stack.h b/intern/cycles/kernel/integrator/intersect_volume_stack.h index 49ef01dc870..9ba4a0a3964 100644 --- a/intern/cycles/kernel/integrator/intersect_volume_stack.h +++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h @@ -24,7 +24,8 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg, Ray volume_ray ccl_optional_struct_init; volume_ray.P = from_P; - volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t); + volume_ray.D = normalize_len(to_P - from_P, &volume_ray.tmax); + volume_ray.tmin = 0.0f; volume_ray.self.object = INTEGRATOR_STATE(state, isect, object); volume_ray.self.prim = INTEGRATOR_STATE(state, isect, prim); volume_ray.self.light_object = OBJECT_NONE; @@ -58,12 +59,9 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg, volume_stack_enter_exit(kg, state, stack_sd); /* Move ray forward. 
*/ - volume_ray.P = stack_sd->P; + volume_ray.tmin = intersection_t_offset(isect.t); volume_ray.self.object = isect.object; volume_ray.self.prim = isect.prim; - if (volume_ray.t != FLT_MAX) { - volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t); - } ++step; } #endif @@ -82,7 +80,8 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s /* Trace ray in random direction. Any direction works, Z up is a guess to get the * fewest hits. */ volume_ray.D = make_float3(0.0f, 0.0f, 1.0f); - volume_ray.t = FLT_MAX; + volume_ray.tmin = 0.0f; + volume_ray.tmax = FLT_MAX; volume_ray.self.object = OBJECT_NONE; volume_ray.self.prim = PRIM_NONE; volume_ray.self.light_object = OBJECT_NONE; @@ -199,7 +198,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s } /* Move ray forward. */ - volume_ray.P = stack_sd->P; + volume_ray.tmin = intersection_t_offset(isect.t); volume_ray.self.object = isect.object; volume_ray.self.prim = isect.prim; ++step; @@ -222,7 +221,9 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt } else { /* Volume stack init for camera rays, continue with intersection of camera ray. 
*/ - INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK, + integrator_path_next(kg, + state, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); } } diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h index 67505b9b612..7a6f866b1a0 100644 --- a/intern/cycles/kernel/integrator/mnee.h +++ b/intern/cycles/kernel/integrator/mnee.h @@ -137,8 +137,14 @@ ccl_device_forceinline void mnee_update_light_sample(KernelGlobals kg, } } else if (ls->type == LIGHT_AREA) { + float invarea = fabsf(klight->area.invarea); ls->D = normalize_len(ls->P - P, &ls->t); - ls->pdf = fabsf(klight->area.invarea); + ls->pdf = invarea; + if (klight->area.tan_spread > 0.f) { + ls->eval_fac = 0.25f * invarea; + ls->eval_fac *= light_spread_attenuation( + ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread); + } } ls->pdf *= kernel_data.integrator.pdf_lights; @@ -436,6 +442,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg, projection_ray.self.light_prim = PRIM_NONE; projection_ray.dP = differential_make_compact(sd->dP); projection_ray.dD = differential_zero_compact(); + projection_ray.tmin = 0.0f; projection_ray.time = sd->time; Intersection projection_isect; @@ -499,8 +506,8 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg, projection_ray.self.prim = pv.prim; projection_ray.P = pv.p; } - projection_ray.D = normalize_len(tentative_p - projection_ray.P, &projection_ray.t); - projection_ray.t *= MNEE_PROJECTION_DISTANCE_MULTIPLIER; + projection_ray.D = normalize_len(tentative_p - projection_ray.P, &projection_ray.tmax); + projection_ray.tmax *= MNEE_PROJECTION_DISTANCE_MULTIPLIER; bool projection_success = false; for (int isect_count = 0; isect_count < MNEE_MAX_INTERSECTION_COUNT; isect_count++) { @@ -519,8 +526,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg, projection_ray.self.object = projection_isect.object; 
projection_ray.self.prim = projection_isect.prim; - projection_ray.P += projection_isect.t * projection_ray.D; - projection_ray.t -= projection_isect.t; + projection_ray.tmin = intersection_t_offset(projection_isect.t); } if (!projection_success) { reduce_stepsize = true; @@ -852,6 +858,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg, Ray probe_ray; probe_ray.self.light_object = ls->object; probe_ray.self.light_prim = ls->prim; + probe_ray.tmin = 0.0f; probe_ray.dP = differential_make_compact(sd->dP); probe_ray.dD = differential_zero_compact(); probe_ray.time = sd->time; @@ -867,13 +874,13 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg, ccl_private const ManifoldVertex &v = vertices[vi]; /* Check visibility. */ - probe_ray.D = normalize_len(v.p - probe_ray.P, &probe_ray.t); + probe_ray.D = normalize_len(v.p - probe_ray.P, &probe_ray.tmax); if (scene_intersect(kg, &probe_ray, PATH_RAY_TRANSMIT, &probe_isect)) { int hit_object = (probe_isect.object == OBJECT_NONE) ? kernel_data_fetch(prim_object, probe_isect.prim) : probe_isect.object; /* Test whether the ray hit the appropriate object at its intended location. */ - if (hit_object != v.object || fabsf(probe_ray.t - probe_isect.t) > MNEE_MIN_DISTANCE) + if (hit_object != v.object || fabsf(probe_ray.tmax - probe_isect.t) > MNEE_MIN_DISTANCE) return false; } probe_ray.self.object = v.object; @@ -952,15 +959,16 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg, probe_ray.self.light_object = ls->object; probe_ray.self.light_prim = ls->prim; probe_ray.P = sd->P; + probe_ray.tmin = 0.0f; if (ls->t == FLT_MAX) { /* Distant / env light. */ probe_ray.D = ls->D; - probe_ray.t = ls->t; + probe_ray.tmax = ls->t; } else { /* Other lights, avoid self-intersection. 
*/ probe_ray.D = ls->P - probe_ray.P; - probe_ray.D = normalize_len(probe_ray.D, &probe_ray.t); + probe_ray.D = normalize_len(probe_ray.D, &probe_ray.tmax); } probe_ray.dP = differential_make_compact(sd->dP); probe_ray.dD = differential_zero_compact(); @@ -1042,9 +1050,7 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg, probe_ray.self.object = probe_isect.object; probe_ray.self.prim = probe_isect.prim; - probe_ray.P += probe_isect.t * probe_ray.D; - if (ls->t != FLT_MAX) - probe_ray.t -= probe_isect.t; + probe_ray.tmin = intersection_t_offset(probe_isect.t); }; /* Mark the manifold walk invalid to keep mollification on by default. */ diff --git a/intern/cycles/kernel/integrator/path_state.h b/intern/cycles/kernel/integrator/path_state.h index 1a085506a70..912c380cdb6 100644 --- a/intern/cycles/kernel/integrator/path_state.h +++ b/intern/cycles/kernel/integrator/path_state.h @@ -52,7 +52,6 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg, INTEGRATOR_STATE_WRITE(state, path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND; INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = 0.0f; - INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f; INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX; INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f; INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f); diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h index 4791a963ae6..a7edfffd175 100644 --- a/intern/cycles/kernel/integrator/shade_background.h +++ b/intern/cycles/kernel/integrator/shade_background.h @@ -62,11 +62,10 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg, const float3 ray_P = INTEGRATOR_STATE(state, ray, P); const float3 ray_D = INTEGRATOR_STATE(state, ray, D); const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf); - const float mis_ray_t = 
INTEGRATOR_STATE(state, path, mis_ray_t); /* multiple importance sampling, get background light pdf for ray * direction, and compute weight with respect to BSDF pdf */ - const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D); + const float pdf = background_light_pdf(kg, ray_P, ray_D); const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf); L *= mis_weight; } @@ -213,7 +212,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg, } #endif - INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h index be926c78439..910e3383f51 100644 --- a/intern/cycles/kernel/integrator/shade_light.h +++ b/intern/cycles/kernel/integrator/shade_light.h @@ -22,19 +22,8 @@ ccl_device_inline void integrate_light(KernelGlobals kg, const float3 ray_D = INTEGRATOR_STATE(state, ray, D); const float ray_time = INTEGRATOR_STATE(state, ray, time); - /* Advance ray beyond light. */ - /* TODO: can we make this more numerically robust to avoid reintersecting the - * same light in some cases? Ray should not intersect surface anymore as the - * object and prim ids will prevent self intersection. */ - const float3 new_ray_P = ray_P + ray_D * isect.t; - INTEGRATOR_STATE_WRITE(state, ray, P) = new_ray_P; - INTEGRATOR_STATE_WRITE(state, ray, t) -= isect.t; - - /* Set position to where the BSDF was sampled, for correct MIS PDF. */ - const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t); - ray_P -= ray_D * mis_ray_t; - isect.t += mis_ray_t; - INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = isect.t; + /* Advance ray to new start distance. 
*/ + INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(isect.t); LightSample ls ccl_optional_struct_init; const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls); @@ -99,11 +88,13 @@ ccl_device void integrator_shade_light(KernelGlobals kg, INTEGRATOR_STATE_WRITE(state, path, transparent_bounce) = transparent_bounce; if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { - INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); + integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT); return; } else { - INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, + integrator_path_next(kg, + state, + DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); return; } diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h index 2b929b7b62e..4b002a47bee 100644 --- a/intern/cycles/kernel/integrator/shade_shadow.h +++ b/intern/cycles/kernel/integrator/shade_shadow.h @@ -75,13 +75,9 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg, ray.self.light_object = OBJECT_NONE; ray.self.light_prim = PRIM_NONE; /* Modify ray position and length to match current segment. */ - const float start_t = (hit == 0) ? 0.0f : - INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t); - const float end_t = (hit < num_recorded_hits) ? - INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit, t) : - ray.t; - ray.P += start_t * ray.D; - ray.t = end_t - start_t; + ray.tmin = (hit == 0) ? ray.tmin : INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t); + ray.tmax = (hit < num_recorded_hits) ? 
INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit, t) : + ray.tmax; shader_setup_from_volume(kg, shadow_sd, &ray); @@ -137,10 +133,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg, /* There are more hits that we could not recorded due to memory usage, * adjust ray to intersect again from the last hit. */ const float last_hit_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, num_recorded_hits - 1, t); - const float3 ray_P = INTEGRATOR_STATE(state, shadow_ray, P); - const float3 ray_D = INTEGRATOR_STATE(state, shadow_ray, D); - INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray_P + last_hit_t * ray_D; - INTEGRATOR_STATE_WRITE(state, shadow_ray, t) -= last_hit_t; + INTEGRATOR_STATE_WRITE(state, shadow_ray, tmin) = intersection_t_offset(last_hit_t); } return false; @@ -158,20 +151,22 @@ ccl_device void integrator_shade_shadow(KernelGlobals kg, /* Evaluate transparent shadows. */ const bool opaque = integrate_transparent_shadow(kg, state, num_hits); if (opaque) { - INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); return; } #endif if (shadow_intersections_has_remaining(num_hits)) { /* More intersections to find, continue shadow ray. 
*/ - INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, + integrator_shadow_path_next(kg, + state, + DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW); return; } else { kernel_accum_light(kg, state, render_buffer); - INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); + integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW); return; } } diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h index 57b88b806a4..1514b3956ad 100644 --- a/intern/cycles/kernel/integrator/shade_surface.h +++ b/intern/cycles/kernel/integrator/shade_surface.h @@ -77,7 +77,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg, # endif { const float bsdf_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf); - const float t = sd->ray_length + INTEGRATOR_STATE(state, path, mis_ray_t); + const float t = sd->ray_length; /* Multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf. */ @@ -190,8 +190,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, const bool is_light = light_sample_is_light(&ls); /* Branch off shadow kernel. */ - INTEGRATOR_SHADOW_PATH_INIT( - shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow); + IntegratorShadowState shadow_state = integrator_shadow_path_init( + kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false); /* Copy volume stack and enter/exit volume. */ integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state); @@ -323,16 +323,21 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce( return LABEL_NONE; } - /* Setup ray. Note that clipping works through transparent bounces. */ - INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P; - INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in); - INTEGRATOR_STATE_WRITE(state, ray, t) = (label & LABEL_TRANSPARENT) ? 
- INTEGRATOR_STATE(state, ray, t) - sd->ray_length : - FLT_MAX; + if (label & LABEL_TRANSPARENT) { + /* Only need to modify start distance for transparent. */ + INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(sd->ray_length); + } + else { + /* Setup ray with changed origin and direction. */ + INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P; + INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in); + INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f; + INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP); - INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in); + INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP); + INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in); #endif + } /* Update throughput. */ float3 throughput = INTEGRATOR_STATE(state, path, throughput); @@ -349,12 +354,8 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce( } /* Update path state */ - if (label & LABEL_TRANSPARENT) { - INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) += sd->ray_length; - } - else { + if (!(label & LABEL_TRANSPARENT)) { INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = bsdf_pdf; - INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f; INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf( bsdf_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf)); } @@ -371,17 +372,8 @@ ccl_device_forceinline int integrate_surface_volume_only_bounce(IntegratorState return LABEL_NONE; } - /* Setup ray position, direction stays unchanged. */ - INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P; - - /* Clipping works through transparent. 
*/ - INTEGRATOR_STATE_WRITE(state, ray, t) -= sd->ray_length; - -# ifdef __RAY_DIFFERENTIALS__ - INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP); -# endif - - INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) += sd->ray_length; + /* Only modify start distance. */ + INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(sd->ray_length); return LABEL_TRANSMIT | LABEL_TRANSPARENT; } @@ -432,7 +424,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg, Ray ray ccl_optional_struct_init; ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self); ray.D = ao_D; - ray.t = kernel_data.integrator.ao_bounces_distance; + ray.tmin = 0.0f; + ray.tmax = kernel_data.integrator.ao_bounces_distance; ray.time = sd->time; ray.self.object = (skip_self) ? sd->object : OBJECT_NONE; ray.self.prim = (skip_self) ? sd->prim : PRIM_NONE; @@ -442,7 +435,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg, ray.dD = differential_zero_compact(); /* Branch off shadow kernel. */ - INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao); + IntegratorShadowState shadow_state = integrator_shadow_path_init( + kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, true); /* Copy volume stack and enter/exit volume. 
*/ integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state); @@ -604,22 +598,23 @@ ccl_device bool integrate_surface(KernelGlobals kg, } template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE, - int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE> + DeviceKernel current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE> ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer) { if (integrate_surface<node_feature_mask>(kg, state, render_buffer)) { if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SUBSURFACE) { - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); + integrator_path_next( + kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE); } else { - kernel_assert(INTEGRATOR_STATE(state, ray, t) != 0.0f); - INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); + kernel_assert(INTEGRATOR_STATE(state, ray, tmax) != 0.0f); + integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); } } else { - INTEGRATOR_PATH_TERMINATE(current_kernel); + integrator_path_terminate(kg, state, current_kernel); } } diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h index 6cf80f4ddc5..4aab097a7d8 100644 --- a/intern/cycles/kernel/integrator/shade_volume.h +++ b/intern/cycles/kernel/integrator/shade_volume.h @@ -114,7 +114,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg, ccl_device_forceinline void volume_step_init(KernelGlobals kg, ccl_private const RNGState *rng_state, const float object_step_size, - float t, + const float tmin, + const float tmax, ccl_private float *step_size, ccl_private float *step_shade_offset, ccl_private float *steps_offset, @@ -122,7 +123,7 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg, { if (object_step_size 
== FLT_MAX) { /* Homogeneous volume. */ - *step_size = t; + *step_size = tmax - tmin; *step_shade_offset = 0.0f; *steps_offset = 1.0f; *max_steps = 1; @@ -130,6 +131,7 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg, else { /* Heterogeneous volume. */ *max_steps = kernel_data.integrator.volume_max_steps; + const float t = tmax - tmin; float step = min(object_step_size, t); /* compute exact steps in advance for malloc */ @@ -165,7 +167,7 @@ ccl_device void volume_shadow_homogeneous(KernelGlobals kg, IntegratorState stat float3 sigma_t = zero_float3(); if (shadow_volume_shader_sample(kg, state, sd, &sigma_t)) { - *throughput *= volume_color_transmittance(sigma_t, ray->t); + *throughput *= volume_color_transmittance(sigma_t, ray->tmax - ray->tmin); } } # endif @@ -194,7 +196,8 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg, volume_step_init(kg, &rng_state, object_step_size, - ray->t, + ray->tmin, + ray->tmax, &step_size, &step_shade_offset, &unused, @@ -202,13 +205,13 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg, const float steps_offset = 1.0f; /* compute extinction at the start */ - float t = 0.0f; + float t = ray->tmin; float3 sum = zero_float3(); for (int i = 0; i < max_steps; i++) { /* advance to new position */ - float new_t = min(ray->t, (i + steps_offset) * step_size); + float new_t = min(ray->tmax, ray->tmin + (i + steps_offset) * step_size); float dt = new_t - t; float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset); @@ -233,7 +236,7 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg, /* stop if at the end of the volume */ t = new_t; - if (t == ray->t) { + if (t == ray->tmax) { /* Update throughput in case we haven't done it above */ tp = *throughput * exp(sum); break; @@ -257,15 +260,16 @@ ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict r const float xi, ccl_private float *pdf) { - const float t = ray->t; + const float tmin = ray->tmin; + const float 
tmax = ray->tmax; const float delta = dot((light_P - ray->P), ray->D); const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta); if (UNLIKELY(D == 0.0f)) { *pdf = 0.0f; return 0.0f; } - const float theta_a = -atan2f(delta, D); - const float theta_b = atan2f(t - delta, D); + const float theta_a = atan2f(tmin - delta, D); + const float theta_b = atan2f(tmax - delta, D); const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a); if (UNLIKELY(theta_b == theta_a)) { *pdf = 0.0f; @@ -273,7 +277,7 @@ ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict r } *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_)); - return min(t, delta + t_); /* min is only for float precision errors */ + return clamp(delta + t_, tmin, tmax); /* clamp is only for float precision errors */ } ccl_device float volume_equiangular_pdf(ccl_private const Ray *ccl_restrict ray, @@ -286,11 +290,12 @@ ccl_device float volume_equiangular_pdf(ccl_private const Ray *ccl_restrict ray, return 0.0f; } - const float t = ray->t; + const float tmin = ray->tmin; + const float tmax = ray->tmax; const float t_ = sample_t - delta; - const float theta_a = -atan2f(delta, D); - const float theta_b = atan2f(t - delta, D); + const float theta_a = atan2f(tmin - delta, D); + const float theta_b = atan2f(tmax - delta, D); if (UNLIKELY(theta_b == theta_a)) { return 0.0f; } @@ -310,11 +315,12 @@ ccl_device float volume_equiangular_cdf(ccl_private const Ray *ccl_restrict ray, return 0.0f; } - const float t = ray->t; + const float tmin = ray->tmin; + const float tmax = ray->tmax; const float t_ = sample_t - delta; - const float theta_a = -atan2f(delta, D); - const float theta_b = atan2f(t - delta, D); + const float theta_a = atan2f(tmin - delta, D); + const float theta_b = atan2f(tmax - delta, D); if (UNLIKELY(theta_b == theta_a)) { return 0.0f; } @@ -390,8 +396,8 @@ ccl_device float3 volume_emission_integrate(ccl_private VolumeShaderCoefficients typedef struct 
VolumeIntegrateState { /* Volume segment extents. */ - float start_t; - float end_t; + float tmin; + float tmax; /* If volume is absorption-only up to this point, and no probabilistic * scattering or termination has been used yet. */ @@ -426,9 +432,9 @@ ccl_device_forceinline void volume_integrate_step_scattering( /* Equiangular sampling for direct lighting. */ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) { - if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t && + if (result.direct_t >= vstate.tmin && result.direct_t <= vstate.tmax && vstate.equiangular_pdf > VOLUME_SAMPLE_PDF_CUTOFF) { - const float new_dt = result.direct_t - vstate.start_t; + const float new_dt = result.direct_t - vstate.tmin; const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); result.direct_scatter = true; @@ -458,7 +464,7 @@ ccl_device_forceinline void volume_integrate_step_scattering( /* compute sampling distance */ const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel); const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t; - const float new_t = vstate.start_t + new_dt; + const float new_t = vstate.tmin + new_dt; /* transmittance and pdf */ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); @@ -528,7 +534,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous( volume_step_init(kg, rng_state, object_step_size, - ray->t, + ray->tmin, + ray->tmax, &step_size, &step_shade_offset, &steps_offset, @@ -536,8 +543,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous( /* Initialize volume integration state. 
*/ VolumeIntegrateState vstate ccl_optional_struct_init; - vstate.start_t = 0.0f; - vstate.end_t = 0.0f; + vstate.tmin = ray->tmin; + vstate.tmax = ray->tmin; vstate.absorption_only = true; vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE); vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL); @@ -578,8 +585,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous( for (int i = 0; i < max_steps; i++) { /* Advance to new position */ - vstate.end_t = min(ray->t, (i + steps_offset) * step_size); - const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset; + vstate.tmax = min(ray->tmax, ray->tmin + (i + steps_offset) * step_size); + const float shade_t = vstate.tmin + (vstate.tmax - vstate.tmin) * step_shade_offset; sd->P = ray->P + ray->D * shade_t; /* compute segment */ @@ -588,7 +595,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous( const int closure_flag = sd->flag; /* Evaluate transmittance over segment. */ - const float dt = (vstate.end_t - vstate.start_t); + const float dt = (vstate.tmax - vstate.tmin); const float3 transmittance = (closure_flag & SD_EXTINCTION) ? volume_color_transmittance(coeff.sigma_t, dt) : one_float3(); @@ -645,8 +652,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous( } /* Stop if at the end of the volume. */ - vstate.start_t = vstate.end_t; - if (vstate.start_t == ray->t) { + vstate.tmin = vstate.tmax; + if (vstate.tmin == ray->tmax) { break; } } @@ -774,8 +781,8 @@ ccl_device_forceinline void integrate_volume_direct_light( const bool is_light = light_sample_is_light(ls); /* Branch off shadow kernel. */ - INTEGRATOR_SHADOW_PATH_INIT( - shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow); + IntegratorShadowState shadow_state = integrator_shadow_path_init( + kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false); /* Write shadow ray and associated state to global memory. 
*/ integrator_state_write_shadow_ray(kg, shadow_state, &ray); @@ -880,7 +887,8 @@ ccl_device_forceinline bool integrate_volume_phase_scatter( /* Setup ray. */ INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P; INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(phase_omega_in); - INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX; + INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f; + INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX; # ifdef __RAY_DIFFERENTIALS__ INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP); INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(phase_domega_in); @@ -901,7 +909,6 @@ ccl_device_forceinline bool integrate_volume_phase_scatter( /* Update path state */ INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = phase_pdf; - INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f; INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf( phase_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf)); @@ -1021,7 +1028,7 @@ ccl_device void integrator_shade_volume(KernelGlobals kg, integrator_state_read_isect(kg, state, &isect); /* Set ray length to current segment. */ - ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX; + ray.tmax = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX; /* Clean volume stack for background rays. */ if (isect.prim == PRIM_NONE) { @@ -1032,13 +1039,15 @@ ccl_device void integrator_shade_volume(KernelGlobals kg, if (event == VOLUME_PATH_SCATTERED) { /* Queue intersect_closest kernel. */ - INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, + integrator_path_next(kg, + state, + DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST); return; } else if (event == VOLUME_PATH_MISSED) { /* End path. 
*/ - INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); + integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME); return; } else { diff --git a/intern/cycles/kernel/integrator/shadow_catcher.h b/intern/cycles/kernel/integrator/shadow_catcher.h index 42d44580f80..ff63625aceb 100644 --- a/intern/cycles/kernel/integrator/shadow_catcher.h +++ b/intern/cycles/kernel/integrator/shadow_catcher.h @@ -50,7 +50,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg, ConstIntegratorState state) { - if (INTEGRATOR_PATH_IS_TERMINATED) { + if (integrator_path_is_terminated(state)) { return false; } diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h index eaee65ada40..c340467606d 100644 --- a/intern/cycles/kernel/integrator/shadow_state_template.h +++ b/intern/cycles/kernel/integrator/shadow_state_template.h @@ -47,7 +47,8 @@ KERNEL_STRUCT_END(shadow_path) KERNEL_STRUCT_BEGIN(shadow_ray) KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, int, object, KERNEL_FEATURE_PATH_TRACING) diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h index d6fef27f344..d10d31e930e 100644 --- a/intern/cycles/kernel/integrator/state.h +++ b/intern/cycles/kernel/integrator/state.h @@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU { /* 
Index of main path which will be used by a next shadow catcher split. */ ccl_global int *next_main_path_index; + + /* Divisor used to partition active indices by locality when sorting by material. */ + uint sort_partition_divisor; } IntegratorStateGPU; /* Abstraction diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h index fed74d49434..4b03c665e17 100644 --- a/intern/cycles/kernel/integrator/state_flow.h +++ b/intern/cycles/kernel/integrator/state_flow.h @@ -10,125 +10,196 @@ CCL_NAMESPACE_BEGIN /* Control Flow * - * Utilities for control flow between kernels. The implementation may differ per device - * or even be handled on the host side. To abstract such differences, experiment with - * different implementations and for debugging, this is abstracted using macros. + * Utilities for control flow between kernels. The implementation is different between CPU and + * GPU devices. For the latter part of the logic is handled on the host side with wavefronts. * * There is a main path for regular path tracing camera for path tracing. Shadows for next * event estimation branch off from this into their own path, that may be computed in - * parallel while the main path continues. + * parallel while the main path continues. Additionally, shading kernels are sorted using + * a key for coherence. * * Each kernel on the main path must call one of these functions. These may not be called * multiple times from the same kernel. * - * INTEGRATOR_PATH_INIT(next_kernel) - * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) - * INTEGRATOR_PATH_TERMINATE(current_kernel) + * integrator_path_init(kg, state, next_kernel) + * integrator_path_next(kg, state, current_kernel, next_kernel) + * integrator_path_terminate(kg, state, current_kernel) * * For the shadow path similar functions are used, and again each shadow kernel must call * one of them, and only once. 
*/ -#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(state, path, queued_kernel) == 0) -#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED \ - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0) +ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state) +{ + return INTEGRATOR_STATE(state, path, queued_kernel) == 0; +} + +ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state) +{ + return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0; +} #ifdef __KERNEL_GPU__ -# define INTEGRATOR_PATH_INIT(next_kernel) \ - atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ - 1); \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; -# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \ - atomic_fetch_and_sub_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ - atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ - 1); \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; -# define INTEGRATOR_PATH_TERMINATE(current_kernel) \ - atomic_fetch_and_sub_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; - -# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \ - IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \ - &kernel_integrator_state.next_shadow_path_index[0], 1); \ - atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ - 1); \ - INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel; -# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \ - atomic_fetch_and_sub_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ - 
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \ - 1); \ - INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; -# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \ - atomic_fetch_and_sub_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ - INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; - -# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \ - { \ - const int key_ = key; \ - atomic_fetch_and_add_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \ - INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \ - atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \ - 1); \ - } -# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \ - { \ - const int key_ = key; \ - atomic_fetch_and_sub_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \ - atomic_fetch_and_add_uint32( \ - &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \ - INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \ - atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \ - 1); \ - } +ccl_device_forceinline void integrator_path_init(KernelGlobals kg, + IntegratorState state, + const DeviceKernel next_kernel) +{ + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; +} + +ccl_device_forceinline void integrator_path_next(KernelGlobals kg, + IntegratorState state, + const DeviceKernel current_kernel, + const DeviceKernel next_kernel) +{ + atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel], + 1); + 
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; +} + +ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg, + IntegratorState state, + const DeviceKernel current_kernel) +{ + atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel], + 1); + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; +} + +ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init( + KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao) +{ + IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( + &kernel_integrator_state.next_shadow_path_index[0], 1); + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); + INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel; + return shadow_state; +} + +ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg, + IntegratorShadowState state, + const DeviceKernel current_kernel, + const DeviceKernel next_kernel) +{ + atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel], + 1); + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; +} + +ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg, + IntegratorShadowState state, + const DeviceKernel current_kernel) +{ + atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel], + 1); + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; +} + +/* Sort first by truncated state index (for good locality), then by key (for good coherence). 
*/ +# define INTEGRATOR_SORT_KEY(key, state) \ + (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor)) + +ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg, + IntegratorState state, + const DeviceKernel next_kernel, + const uint32_t key) +{ + const int key_ = INTEGRATOR_SORT_KEY(key, state); + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; + INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; + atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1); +} + +ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg, + IntegratorState state, + const DeviceKernel current_kernel, + const DeviceKernel next_kernel, + const uint32_t key) +{ + const int key_ = INTEGRATOR_SORT_KEY(key, state); + atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel], + 1); + atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; + INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; + atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1); +} #else -# define INTEGRATOR_PATH_INIT(next_kernel) \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; -# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \ - { \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \ - (void)key; \ - } -# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \ - { \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \ - (void)current_kernel; \ - } -# define INTEGRATOR_PATH_TERMINATE(current_kernel) \ - { \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; \ - (void)current_kernel; \ - } -# define 
INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \ - { \ - INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \ - (void)key; \ - (void)current_kernel; \ - } - -# define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \ - IntegratorShadowState shadow_state = &state->shadow_type; \ - INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel; -# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \ - { \ - INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \ - (void)current_kernel; \ - } -# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \ - { \ - INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; \ - (void)current_kernel; \ - } +ccl_device_forceinline void integrator_path_init(KernelGlobals kg, + IntegratorState state, + const DeviceKernel next_kernel) +{ + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; +} + +ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg, + IntegratorState state, + const DeviceKernel next_kernel, + const uint32_t key) +{ + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; + (void)key; +} + +ccl_device_forceinline void integrator_path_next(KernelGlobals kg, + IntegratorState state, + const DeviceKernel current_kernel, + const DeviceKernel next_kernel) +{ + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; + (void)current_kernel; +} + +ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg, + IntegratorState state, + const DeviceKernel current_kernel) +{ + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; + (void)current_kernel; +} + +ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg, + IntegratorState state, + const DeviceKernel current_kernel, + const DeviceKernel next_kernel, + const uint32_t key) +{ + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; + (void)key; + 
(void)current_kernel; +} + +ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init( + KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao) +{ + IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow; + INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel; + return shadow_state; +} + +ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg, + IntegratorShadowState state, + const DeviceKernel current_kernel, + const DeviceKernel next_kernel) +{ + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; + (void)current_kernel; +} + +ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg, + IntegratorShadowState state, + const DeviceKernel current_kernel) +{ + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; + (void)current_kernel; +} #endif diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h index e7e6db037b0..5c2af131945 100644 --- a/intern/cycles/kernel/integrator/state_template.h +++ b/intern/cycles/kernel/integrator/state_template.h @@ -37,11 +37,10 @@ KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) /* enum PathRayMNEE */ KERNEL_STRUCT_MEMBER(path, uint8_t, mnee, KERNEL_FEATURE_PATH_TRACING) /* Multiple importance sampling - * The PDF of BSDF sampling at the last scatter point, and distance to the - * last scatter point minus the last ray segment. This distance lets us - * compute the complete distance through transparent surfaces and volumes. */ + * The PDF of BSDF sampling at the last scatter point, which is at ray distance + * zero and distance. Note that transparency and volume attenuation increase + * the ray tmin but keep P unmodified so that this works. 
*/ KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING) /* Filter glossy. */ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING) /* Continuation probability for path termination. */ @@ -63,7 +62,8 @@ KERNEL_STRUCT_END(path) KERNEL_STRUCT_BEGIN(ray) KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING) diff --git a/intern/cycles/kernel/integrator/state_util.h b/intern/cycles/kernel/integrator/state_util.h index 280db2d1aac..8dd58ad6bcd 100644 --- a/intern/cycles/kernel/integrator/state_util.h +++ b/intern/cycles/kernel/integrator/state_util.h @@ -17,7 +17,8 @@ ccl_device_forceinline void integrator_state_write_ray(KernelGlobals kg, { INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P; INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D; - INTEGRATOR_STATE_WRITE(state, ray, t) = ray->t; + INTEGRATOR_STATE_WRITE(state, ray, tmin) = ray->tmin; + INTEGRATOR_STATE_WRITE(state, ray, tmax) = ray->tmax; INTEGRATOR_STATE_WRITE(state, ray, time) = ray->time; INTEGRATOR_STATE_WRITE(state, ray, dP) = ray->dP; INTEGRATOR_STATE_WRITE(state, ray, dD) = ray->dD; @@ -29,7 +30,8 @@ ccl_device_forceinline void integrator_state_read_ray(KernelGlobals kg, { ray->P = INTEGRATOR_STATE(state, ray, P); ray->D = INTEGRATOR_STATE(state, ray, D); - ray->t = INTEGRATOR_STATE(state, ray, t); + ray->tmin = INTEGRATOR_STATE(state, ray, tmin); + ray->tmax = INTEGRATOR_STATE(state, ray, tmax); 
ray->time = INTEGRATOR_STATE(state, ray, time); ray->dP = INTEGRATOR_STATE(state, ray, dP); ray->dD = INTEGRATOR_STATE(state, ray, dD); @@ -42,7 +44,8 @@ ccl_device_forceinline void integrator_state_write_shadow_ray( { INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray->P; INTEGRATOR_STATE_WRITE(state, shadow_ray, D) = ray->D; - INTEGRATOR_STATE_WRITE(state, shadow_ray, t) = ray->t; + INTEGRATOR_STATE_WRITE(state, shadow_ray, tmin) = ray->tmin; + INTEGRATOR_STATE_WRITE(state, shadow_ray, tmax) = ray->tmax; INTEGRATOR_STATE_WRITE(state, shadow_ray, time) = ray->time; INTEGRATOR_STATE_WRITE(state, shadow_ray, dP) = ray->dP; } @@ -53,7 +56,8 @@ ccl_device_forceinline void integrator_state_read_shadow_ray(KernelGlobals kg, { ray->P = INTEGRATOR_STATE(state, shadow_ray, P); ray->D = INTEGRATOR_STATE(state, shadow_ray, D); - ray->t = INTEGRATOR_STATE(state, shadow_ray, t); + ray->tmin = INTEGRATOR_STATE(state, shadow_ray, tmin); + ray->tmax = INTEGRATOR_STATE(state, shadow_ray, tmax); ray->time = INTEGRATOR_STATE(state, shadow_ray, time); ray->dP = INTEGRATOR_STATE(state, shadow_ray, dP); ray->dD = differential_zero_compact(); diff --git a/intern/cycles/kernel/integrator/subsurface.h b/intern/cycles/kernel/integrator/subsurface.h index 1e6fcf4aff0..2f96f215d8a 100644 --- a/intern/cycles/kernel/integrator/subsurface.h +++ b/intern/cycles/kernel/integrator/subsurface.h @@ -38,7 +38,8 @@ ccl_device int subsurface_bounce(KernelGlobals kg, /* Setup ray into surface. 
*/ INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P; INTEGRATOR_STATE_WRITE(state, ray, D) = bssrdf->N; - INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX; + INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f; + INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX; INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP); INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_zero_compact(); @@ -160,7 +161,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat /* Pretend ray is coming from the outside towards the exit point. This ensures * correct front/back facing normals. * TODO: find a more elegant solution? */ - ray.P += ray.D * ray.t * 2.0f; + ray.P += ray.D * ray.tmax * 2.0f; ray.D = -ray.D; integrator_state_write_isect(kg, state, &ss_isect.hits[0]); @@ -177,17 +178,23 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE); if (use_caustics) { - INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + integrator_path_next_sorted(kg, + state, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader); } else if (use_raytrace_kernel) { - INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + integrator_path_next_sorted(kg, + state, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader); } else { - INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, + integrator_path_next_sorted(kg, + state, + DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader); } diff --git a/intern/cycles/kernel/integrator/subsurface_disk.h b/intern/cycles/kernel/integrator/subsurface_disk.h index ae857c50493..2836934f6dd 100644 --- a/intern/cycles/kernel/integrator/subsurface_disk.h +++ b/intern/cycles/kernel/integrator/subsurface_disk.h @@ -82,7 +82,8 @@ 
ccl_device_inline bool subsurface_disk(KernelGlobals kg, /* Create ray. */ ray.P = P + disk_N * disk_height + disk_P; ray.D = -disk_N; - ray.t = 2.0f * disk_height; + ray.tmin = 0.0f; + ray.tmax = 2.0f * disk_height; ray.dP = ray_dP; ray.dD = differential_zero_compact(); ray.time = time; @@ -188,7 +189,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg, ray.P = ray.P + ray.D * ss_isect.hits[hit].t; ray.D = ss_isect.Ng[hit]; - ray.t = 1.0f; + ray.tmin = 0.0f; + ray.tmax = 1.0f; return true; } diff --git a/intern/cycles/kernel/integrator/subsurface_random_walk.h b/intern/cycles/kernel/integrator/subsurface_random_walk.h index 8094bf7159e..c1691030817 100644 --- a/intern/cycles/kernel/integrator/subsurface_random_walk.h +++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h @@ -195,7 +195,8 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg, /* Setup ray. */ ray.P = P; ray.D = D; - ray.t = FLT_MAX; + ray.tmin = 0.0f; + ray.tmax = FLT_MAX; ray.time = time; ray.dP = ray_dP; ray.dD = differential_zero_compact(); @@ -370,10 +371,10 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg, * chance of connecting to it. * TODO: Maybe use less than 10 times the mean free path? */ if (bounce == 0) { - ray.t = max(t, 10.0f / (reduce_min(sigma_t))); + ray.tmax = max(t, 10.0f / (reduce_min(sigma_t))); } else { - ray.t = t; + ray.tmax = t; /* After the first bounce the object can intersect the same surface again */ ray.self.object = OBJECT_NONE; ray.self.prim = PRIM_NONE; @@ -384,12 +385,12 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg, if (hit) { #ifdef __KERNEL_GPU_RAYTRACING__ /* t is always in world space with OptiX and MetalRT. */ - ray.t = ss_isect.hits[0].t; + ray.tmax = ss_isect.hits[0].t; #else /* Compute world space distance to surface hit. 
*/ float3 D = transform_direction(&ob_itfm, ray.D); D = normalize(D) * ss_isect.hits[0].t; - ray.t = len(transform_direction(&ob_tfm, D)); + ray.tmax = len(transform_direction(&ob_tfm, D)); #endif } @@ -397,16 +398,16 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg, /* Check if we hit the opposite side. */ if (hit) { have_opposite_interface = true; - opposite_distance = dot(ray.P + ray.t * ray.D - P, -N); + opposite_distance = dot(ray.P + ray.tmax * ray.D - P, -N); } /* Apart from the opposite side check, we were supposed to only trace up to distance t, * so check if there would have been a hit in that case. */ - hit = ray.t < t; + hit = ray.tmax < t; } /* Use the distance to the exit point for the throughput update if we found one. */ if (hit) { - t = ray.t; + t = ray.tmax; } /* Advance to new scatter location. */ diff --git a/intern/cycles/kernel/light/light.h b/intern/cycles/kernel/light/light.h index 1e7a333d013..b939489bb18 100644 --- a/intern/cycles/kernel/light/light.h +++ b/intern/cycles/kernel/light/light.h @@ -270,31 +270,26 @@ ccl_device bool lights_intersect(KernelGlobals kg, if (type == LIGHT_SPOT) { /* Spot/Disk light. */ - const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t); - const float3 ray_P = ray->P - ray->D * mis_ray_t; - const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); const float radius = klight->spot.radius; if (radius == 0.0f) { continue; } /* disk oriented normal */ - const float3 lightN = normalize(ray_P - lightP); + const float3 lightN = normalize(ray->P - lightP); /* One sided. */ if (dot(ray->D, lightN) >= 0.0f) { continue; } float3 P; - if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) { + if (!ray_disk_intersect( + ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) { continue; } } else if (type == LIGHT_POINT) { /* Sphere light (aka, aligned disk light). 
*/ - const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t); - const float3 ray_P = ray->P - ray->D * mis_ray_t; - const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); const float radius = klight->spot.radius; if (radius == 0.0f) { @@ -302,9 +297,10 @@ ccl_device bool lights_intersect(KernelGlobals kg, } /* disk oriented normal */ - const float3 lightN = normalize(ray_P - lightP); + const float3 lightN = normalize(ray->P - lightP); float3 P; - if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) { + if (!ray_disk_intersect( + ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) { continue; } } @@ -330,8 +326,19 @@ ccl_device bool lights_intersect(KernelGlobals kg, const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]); float3 P; - if (!ray_quad_intersect( - ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) { + if (!ray_quad_intersect(ray->P, + ray->D, + ray->tmin, + ray->tmax, + light_P, + axisu, + axisv, + Ng, + &P, + &t, + &u, + &v, + is_round)) { continue; } } @@ -775,7 +782,8 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg, ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B); /* calculate intersection with the planar triangle */ - if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) { + if (!ray_triangle_intersect( + P, ls->D, 0.0f, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) { ls->pdf = 0.0f; return; } diff --git a/intern/cycles/kernel/light/sample.h b/intern/cycles/kernel/light/sample.h index 5cf7dce683a..210bb1b35c2 100644 --- a/intern/cycles/kernel/light/sample.h +++ b/intern/cycles/kernel/light/sample.h @@ -227,23 +227,24 @@ ccl_device_inline void shadow_ray_setup(ccl_private const ShaderData *ccl_restri if (ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ ray->P = P; + ray->tmin = 0.0f; if (ls->t == FLT_MAX) { /* distant 
light */ ray->D = ls->D; - ray->t = ls->t; + ray->tmax = ls->t; } else { /* other lights, avoid self-intersection */ ray->D = ls->P - P; - ray->D = normalize_len(ray->D, &ray->t); + ray->D = normalize_len(ray->D, &ray->tmax); } } else { /* signal to not cast shadow ray */ ray->P = zero_float3(); ray->D = zero_float3(); - ray->t = 0.0f; + ray->tmax = 0.0f; } ray->dP = differential_make_compact(sd->dP); diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp index 78c23b858c4..6b7981b7f3a 100644 --- a/intern/cycles/kernel/osl/services.cpp +++ b/intern/cycles/kernel/osl/services.cpp @@ -1094,10 +1094,8 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg, ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P); if (derivatives) { - ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) - - ndc[0]; - ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) - - ndc[0]; + ndc[1] = zero_float3(); + ndc[2] = zero_float3(); } } else { @@ -1671,7 +1669,8 @@ bool OSLRenderServices::trace(TraceOpt &options, ray.P = TO_FLOAT3(P); ray.D = TO_FLOAT3(R); - ray.t = (options.maxdist == 1.0e30f) ? FLT_MAX : options.maxdist - options.mindist; + ray.tmin = 0.0f; + ray.tmax = (options.maxdist == 1.0e30f) ? 
FLT_MAX : options.maxdist - options.mindist; ray.time = sd->time; ray.self.object = OBJECT_NONE; ray.self.prim = PRIM_NONE; diff --git a/intern/cycles/kernel/svm/ao.h b/intern/cycles/kernel/svm/ao.h index e66c535824c..c57c68d6230 100644 --- a/intern/cycles/kernel/svm/ao.h +++ b/intern/cycles/kernel/svm/ao.h @@ -59,7 +59,8 @@ ccl_device float svm_ao( Ray ray; ray.P = sd->P; ray.D = D.x * T + D.y * B + D.z * N; - ray.t = max_dist; + ray.tmin = 0.0f; + ray.tmax = max_dist; ray.time = sd->time; ray.self.object = sd->object; ray.self.prim = sd->prim; diff --git a/intern/cycles/kernel/svm/bevel.h b/intern/cycles/kernel/svm/bevel.h index 790437d8e82..4617a056a52 100644 --- a/intern/cycles/kernel/svm/bevel.h +++ b/intern/cycles/kernel/svm/bevel.h @@ -179,7 +179,8 @@ ccl_device float3 svm_bevel( Ray ray ccl_optional_struct_init; ray.P = sd->P + disk_N * disk_height + disk_P; ray.D = -disk_N; - ray.t = 2.0f * disk_height; + ray.tmin = 0.0f; + ray.tmax = 2.0f * disk_height; ray.dP = differential_zero_compact(); ray.dD = differential_zero_compact(); ray.time = sd->time; diff --git a/intern/cycles/kernel/svm/node_types_template.h b/intern/cycles/kernel/svm/node_types_template.h new file mode 100644 index 00000000000..39d279be4cb --- /dev/null +++ b/intern/cycles/kernel/svm/node_types_template.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#ifndef SHADER_NODE_TYPE +# define SHADER_NODE_TYPE(name) +#endif + +/* NOTE: for best OpenCL performance, item definition in the enum must + * match the switch case order in `svm.h`. 
*/ + +SHADER_NODE_TYPE(NODE_END) +SHADER_NODE_TYPE(NODE_SHADER_JUMP) +SHADER_NODE_TYPE(NODE_CLOSURE_BSDF) +SHADER_NODE_TYPE(NODE_CLOSURE_EMISSION) +SHADER_NODE_TYPE(NODE_CLOSURE_BACKGROUND) +SHADER_NODE_TYPE(NODE_CLOSURE_SET_WEIGHT) +SHADER_NODE_TYPE(NODE_CLOSURE_WEIGHT) +SHADER_NODE_TYPE(NODE_EMISSION_WEIGHT) +SHADER_NODE_TYPE(NODE_MIX_CLOSURE) +SHADER_NODE_TYPE(NODE_JUMP_IF_ZERO) +SHADER_NODE_TYPE(NODE_JUMP_IF_ONE) +SHADER_NODE_TYPE(NODE_GEOMETRY) +SHADER_NODE_TYPE(NODE_CONVERT) +SHADER_NODE_TYPE(NODE_TEX_COORD) +SHADER_NODE_TYPE(NODE_VALUE_F) +SHADER_NODE_TYPE(NODE_VALUE_V) +SHADER_NODE_TYPE(NODE_ATTR) +SHADER_NODE_TYPE(NODE_VERTEX_COLOR) +SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DX) +SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DY) +SHADER_NODE_TYPE(NODE_SET_DISPLACEMENT) +SHADER_NODE_TYPE(NODE_DISPLACEMENT) +SHADER_NODE_TYPE(NODE_VECTOR_DISPLACEMENT) +SHADER_NODE_TYPE(NODE_TEX_IMAGE) +SHADER_NODE_TYPE(NODE_TEX_IMAGE_BOX) +SHADER_NODE_TYPE(NODE_TEX_NOISE) +SHADER_NODE_TYPE(NODE_SET_BUMP) +SHADER_NODE_TYPE(NODE_ATTR_BUMP_DX) +SHADER_NODE_TYPE(NODE_ATTR_BUMP_DY) +SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DX) +SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DY) +SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DX) +SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DY) +SHADER_NODE_TYPE(NODE_CLOSURE_SET_NORMAL) +SHADER_NODE_TYPE(NODE_ENTER_BUMP_EVAL) +SHADER_NODE_TYPE(NODE_LEAVE_BUMP_EVAL) +SHADER_NODE_TYPE(NODE_HSV) +SHADER_NODE_TYPE(NODE_CLOSURE_HOLDOUT) +SHADER_NODE_TYPE(NODE_FRESNEL) +SHADER_NODE_TYPE(NODE_LAYER_WEIGHT) +SHADER_NODE_TYPE(NODE_CLOSURE_VOLUME) +SHADER_NODE_TYPE(NODE_PRINCIPLED_VOLUME) +SHADER_NODE_TYPE(NODE_MATH) +SHADER_NODE_TYPE(NODE_VECTOR_MATH) +SHADER_NODE_TYPE(NODE_RGB_RAMP) +SHADER_NODE_TYPE(NODE_GAMMA) +SHADER_NODE_TYPE(NODE_BRIGHTCONTRAST) +SHADER_NODE_TYPE(NODE_LIGHT_PATH) +SHADER_NODE_TYPE(NODE_OBJECT_INFO) +SHADER_NODE_TYPE(NODE_PARTICLE_INFO) +SHADER_NODE_TYPE(NODE_HAIR_INFO) +SHADER_NODE_TYPE(NODE_POINT_INFO) +SHADER_NODE_TYPE(NODE_TEXTURE_MAPPING) +SHADER_NODE_TYPE(NODE_MAPPING) 
+SHADER_NODE_TYPE(NODE_MIN_MAX) +SHADER_NODE_TYPE(NODE_CAMERA) +SHADER_NODE_TYPE(NODE_TEX_ENVIRONMENT) +SHADER_NODE_TYPE(NODE_TEX_SKY) +SHADER_NODE_TYPE(NODE_TEX_GRADIENT) +SHADER_NODE_TYPE(NODE_TEX_VORONOI) +SHADER_NODE_TYPE(NODE_TEX_MUSGRAVE) +SHADER_NODE_TYPE(NODE_TEX_WAVE) +SHADER_NODE_TYPE(NODE_TEX_MAGIC) +SHADER_NODE_TYPE(NODE_TEX_CHECKER) +SHADER_NODE_TYPE(NODE_TEX_BRICK) +SHADER_NODE_TYPE(NODE_TEX_WHITE_NOISE) +SHADER_NODE_TYPE(NODE_NORMAL) +SHADER_NODE_TYPE(NODE_LIGHT_FALLOFF) +SHADER_NODE_TYPE(NODE_IES) +SHADER_NODE_TYPE(NODE_CURVES) +SHADER_NODE_TYPE(NODE_TANGENT) +SHADER_NODE_TYPE(NODE_NORMAL_MAP) +SHADER_NODE_TYPE(NODE_INVERT) +SHADER_NODE_TYPE(NODE_MIX) +SHADER_NODE_TYPE(NODE_SEPARATE_COLOR) +SHADER_NODE_TYPE(NODE_COMBINE_COLOR) +SHADER_NODE_TYPE(NODE_SEPARATE_VECTOR) +SHADER_NODE_TYPE(NODE_COMBINE_VECTOR) +SHADER_NODE_TYPE(NODE_SEPARATE_HSV) +SHADER_NODE_TYPE(NODE_COMBINE_HSV) +SHADER_NODE_TYPE(NODE_VECTOR_ROTATE) +SHADER_NODE_TYPE(NODE_VECTOR_TRANSFORM) +SHADER_NODE_TYPE(NODE_WIREFRAME) +SHADER_NODE_TYPE(NODE_WAVELENGTH) +SHADER_NODE_TYPE(NODE_BLACKBODY) +SHADER_NODE_TYPE(NODE_MAP_RANGE) +SHADER_NODE_TYPE(NODE_VECTOR_MAP_RANGE) +SHADER_NODE_TYPE(NODE_CLAMP) +SHADER_NODE_TYPE(NODE_BEVEL) +SHADER_NODE_TYPE(NODE_AMBIENT_OCCLUSION) +SHADER_NODE_TYPE(NODE_TEX_VOXEL) +SHADER_NODE_TYPE(NODE_AOV_START) +SHADER_NODE_TYPE(NODE_AOV_COLOR) +SHADER_NODE_TYPE(NODE_AOV_VALUE) +SHADER_NODE_TYPE(NODE_FLOAT_CURVE) + +/* Padding for struct alignment. 
*/ +SHADER_NODE_TYPE(NODE_PAD1) + +#undef SHADER_NODE_TYPE diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 8fd41ec8531..9d6d3e9222c 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -204,6 +204,15 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN +#ifdef __KERNEL_USE_DATA_CONSTANTS__ +# define SVM_CASE(node) \ + case node: \ + if (!kernel_data_svm_usage_##node) \ + break; +#else +# define SVM_CASE(node) case node: +#endif + /* Main Interpreter Loop */ template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState> ccl_device void svm_eval_nodes(KernelGlobals kg, @@ -219,9 +228,10 @@ ccl_device void svm_eval_nodes(KernelGlobals kg, uint4 node = read_node(kg, &offset); switch (node.x) { - case NODE_END: - return; - case NODE_SHADER_JUMP: { + SVM_CASE(NODE_END) + return; + SVM_CASE(NODE_SHADER_JUMP) + { if (type == SHADER_TYPE_SURFACE) offset = node.y; else if (type == SHADER_TYPE_VOLUME) @@ -232,351 +242,349 @@ ccl_device void svm_eval_nodes(KernelGlobals kg, return; break; } - case NODE_CLOSURE_BSDF: - offset = svm_node_closure_bsdf<node_feature_mask, type>( - kg, sd, stack, node, path_flag, offset); - break; - case NODE_CLOSURE_EMISSION: - IF_KERNEL_NODES_FEATURE(EMISSION) - { - svm_node_closure_emission(sd, stack, node); - } - break; - case NODE_CLOSURE_BACKGROUND: - IF_KERNEL_NODES_FEATURE(EMISSION) - { - svm_node_closure_background(sd, stack, node); - } - break; - case NODE_CLOSURE_SET_WEIGHT: - svm_node_closure_set_weight(sd, node.y, node.z, node.w); - break; - case NODE_CLOSURE_WEIGHT: - svm_node_closure_weight(sd, stack, node.y); - break; - case NODE_EMISSION_WEIGHT: - IF_KERNEL_NODES_FEATURE(EMISSION) - { - svm_node_emission_weight(kg, sd, stack, node); - } - break; - case NODE_MIX_CLOSURE: - svm_node_mix_closure(sd, stack, node); - break; - case NODE_JUMP_IF_ZERO: - if (stack_load_float(stack, node.z) <= 0.0f) - offset += node.y; - break; - case NODE_JUMP_IF_ONE: - if 
(stack_load_float(stack, node.z) >= 1.0f) - offset += node.y; - break; - case NODE_GEOMETRY: - svm_node_geometry(kg, sd, stack, node.y, node.z); - break; - case NODE_CONVERT: - svm_node_convert(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_TEX_COORD: - offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); - break; - case NODE_VALUE_F: - svm_node_value_f(kg, sd, stack, node.y, node.z); - break; - case NODE_VALUE_V: - offset = svm_node_value_v(kg, sd, stack, node.y, offset); - break; - case NODE_ATTR: - svm_node_attr<node_feature_mask>(kg, sd, stack, node); - break; - case NODE_VERTEX_COLOR: - svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_GEOMETRY_BUMP_DX: - IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); - } - break; - case NODE_GEOMETRY_BUMP_DY: - IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); - } - break; - case NODE_SET_DISPLACEMENT: - svm_node_set_displacement<node_feature_mask>(kg, sd, stack, node.y); - break; - case NODE_DISPLACEMENT: - svm_node_displacement<node_feature_mask>(kg, sd, stack, node); - break; - case NODE_VECTOR_DISPLACEMENT: - offset = svm_node_vector_displacement<node_feature_mask>(kg, sd, stack, node, offset); - break; - case NODE_TEX_IMAGE: - offset = svm_node_tex_image(kg, sd, stack, node, offset); - break; - case NODE_TEX_IMAGE_BOX: - svm_node_tex_image_box(kg, sd, stack, node); - break; - case NODE_TEX_NOISE: - offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_SET_BUMP: - svm_node_set_bump<node_feature_mask>(kg, sd, stack, node); - break; - case NODE_ATTR_BUMP_DX: - IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_attr_bump_dx(kg, sd, stack, node); - } - break; - case NODE_ATTR_BUMP_DY: - IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_attr_bump_dy(kg, sd, stack, node); - } - break; - case NODE_VERTEX_COLOR_BUMP_DX: - 
IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w); - } - break; - case NODE_VERTEX_COLOR_BUMP_DY: - IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w); - } - break; - case NODE_TEX_COORD_BUMP_DX: - IF_KERNEL_NODES_FEATURE(BUMP) - { - offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset); - } - break; - case NODE_TEX_COORD_BUMP_DY: - IF_KERNEL_NODES_FEATURE(BUMP) - { - offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset); - } - break; - case NODE_CLOSURE_SET_NORMAL: - IF_KERNEL_NODES_FEATURE(BUMP) - { - svm_node_set_normal(kg, sd, stack, node.y, node.z); - } - break; - case NODE_ENTER_BUMP_EVAL: - IF_KERNEL_NODES_FEATURE(BUMP_STATE) - { - svm_node_enter_bump_eval(kg, sd, stack, node.y); - } - break; - case NODE_LEAVE_BUMP_EVAL: - IF_KERNEL_NODES_FEATURE(BUMP_STATE) - { - svm_node_leave_bump_eval(kg, sd, stack, node.y); - } - break; - case NODE_HSV: - svm_node_hsv(kg, sd, stack, node); - break; - - case NODE_CLOSURE_HOLDOUT: - svm_node_closure_holdout(sd, stack, node); - break; - case NODE_FRESNEL: - svm_node_fresnel(sd, stack, node.y, node.z, node.w); - break; - case NODE_LAYER_WEIGHT: - svm_node_layer_weight(sd, stack, node); - break; - case NODE_CLOSURE_VOLUME: - IF_KERNEL_NODES_FEATURE(VOLUME) - { - svm_node_closure_volume<type>(kg, sd, stack, node); - } - break; - case NODE_PRINCIPLED_VOLUME: - IF_KERNEL_NODES_FEATURE(VOLUME) - { - offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset); - } - break; - case NODE_MATH: - svm_node_math(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_VECTOR_MATH: - offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_RGB_RAMP: - offset = svm_node_rgb_ramp(kg, sd, stack, node, offset); - break; - case NODE_GAMMA: - svm_node_gamma(sd, stack, node.y, node.z, node.w); - break; - case 
NODE_BRIGHTCONTRAST: - svm_node_brightness(sd, stack, node.y, node.z, node.w); - break; - case NODE_LIGHT_PATH: - svm_node_light_path<node_feature_mask>(kg, state, sd, stack, node.y, node.z, path_flag); - break; - case NODE_OBJECT_INFO: - svm_node_object_info(kg, sd, stack, node.y, node.z); - break; - case NODE_PARTICLE_INFO: - svm_node_particle_info(kg, sd, stack, node.y, node.z); - break; + SVM_CASE(NODE_CLOSURE_BSDF) + offset = svm_node_closure_bsdf<node_feature_mask, type>( + kg, sd, stack, node, path_flag, offset); + break; + SVM_CASE(NODE_CLOSURE_EMISSION) + IF_KERNEL_NODES_FEATURE(EMISSION) + { + svm_node_closure_emission(sd, stack, node); + } + break; + SVM_CASE(NODE_CLOSURE_BACKGROUND) + IF_KERNEL_NODES_FEATURE(EMISSION) + { + svm_node_closure_background(sd, stack, node); + } + break; + SVM_CASE(NODE_CLOSURE_SET_WEIGHT) + svm_node_closure_set_weight(sd, node.y, node.z, node.w); + break; + SVM_CASE(NODE_CLOSURE_WEIGHT) + svm_node_closure_weight(sd, stack, node.y); + break; + SVM_CASE(NODE_EMISSION_WEIGHT) + IF_KERNEL_NODES_FEATURE(EMISSION) + { + svm_node_emission_weight(kg, sd, stack, node); + } + break; + SVM_CASE(NODE_MIX_CLOSURE) + svm_node_mix_closure(sd, stack, node); + break; + SVM_CASE(NODE_JUMP_IF_ZERO) + if (stack_load_float(stack, node.z) <= 0.0f) + offset += node.y; + break; + SVM_CASE(NODE_JUMP_IF_ONE) + if (stack_load_float(stack, node.z) >= 1.0f) + offset += node.y; + break; + SVM_CASE(NODE_GEOMETRY) + svm_node_geometry(kg, sd, stack, node.y, node.z); + break; + SVM_CASE(NODE_CONVERT) + svm_node_convert(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_TEX_COORD) + offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset); + break; + SVM_CASE(NODE_VALUE_F) + svm_node_value_f(kg, sd, stack, node.y, node.z); + break; + SVM_CASE(NODE_VALUE_V) + offset = svm_node_value_v(kg, sd, stack, node.y, offset); + break; + SVM_CASE(NODE_ATTR) + svm_node_attr<node_feature_mask>(kg, sd, stack, node); + break; + 
SVM_CASE(NODE_VERTEX_COLOR) + svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_GEOMETRY_BUMP_DX) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z); + } + break; + SVM_CASE(NODE_GEOMETRY_BUMP_DY) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z); + } + break; + SVM_CASE(NODE_SET_DISPLACEMENT) + svm_node_set_displacement<node_feature_mask>(kg, sd, stack, node.y); + break; + SVM_CASE(NODE_DISPLACEMENT) + svm_node_displacement<node_feature_mask>(kg, sd, stack, node); + break; + SVM_CASE(NODE_VECTOR_DISPLACEMENT) + offset = svm_node_vector_displacement<node_feature_mask>(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TEX_IMAGE) + offset = svm_node_tex_image(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TEX_IMAGE_BOX) + svm_node_tex_image_box(kg, sd, stack, node); + break; + SVM_CASE(NODE_TEX_NOISE) + offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_SET_BUMP) + svm_node_set_bump<node_feature_mask>(kg, sd, stack, node); + break; + SVM_CASE(NODE_ATTR_BUMP_DX) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_attr_bump_dx(kg, sd, stack, node); + } + break; + SVM_CASE(NODE_ATTR_BUMP_DY) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_attr_bump_dy(kg, sd, stack, node); + } + break; + SVM_CASE(NODE_VERTEX_COLOR_BUMP_DX) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w); + } + break; + SVM_CASE(NODE_VERTEX_COLOR_BUMP_DY) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w); + } + break; + SVM_CASE(NODE_TEX_COORD_BUMP_DX) + IF_KERNEL_NODES_FEATURE(BUMP) + { + offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset); + } + break; + SVM_CASE(NODE_TEX_COORD_BUMP_DY) + IF_KERNEL_NODES_FEATURE(BUMP) + { + offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, 
node, offset); + } + break; + SVM_CASE(NODE_CLOSURE_SET_NORMAL) + IF_KERNEL_NODES_FEATURE(BUMP) + { + svm_node_set_normal(kg, sd, stack, node.y, node.z); + } + break; + SVM_CASE(NODE_ENTER_BUMP_EVAL) + IF_KERNEL_NODES_FEATURE(BUMP_STATE) + { + svm_node_enter_bump_eval(kg, sd, stack, node.y); + } + break; + SVM_CASE(NODE_LEAVE_BUMP_EVAL) + IF_KERNEL_NODES_FEATURE(BUMP_STATE) + { + svm_node_leave_bump_eval(kg, sd, stack, node.y); + } + break; + SVM_CASE(NODE_HSV) + svm_node_hsv(kg, sd, stack, node); + break; + SVM_CASE(NODE_CLOSURE_HOLDOUT) + svm_node_closure_holdout(sd, stack, node); + break; + SVM_CASE(NODE_FRESNEL) + svm_node_fresnel(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_LAYER_WEIGHT) + svm_node_layer_weight(sd, stack, node); + break; + SVM_CASE(NODE_CLOSURE_VOLUME) + IF_KERNEL_NODES_FEATURE(VOLUME) + { + svm_node_closure_volume<type>(kg, sd, stack, node); + } + break; + SVM_CASE(NODE_PRINCIPLED_VOLUME) + IF_KERNEL_NODES_FEATURE(VOLUME) + { + offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset); + } + break; + SVM_CASE(NODE_MATH) + svm_node_math(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_VECTOR_MATH) + offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_RGB_RAMP) + offset = svm_node_rgb_ramp(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_GAMMA) + svm_node_gamma(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_BRIGHTCONTRAST) + svm_node_brightness(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_LIGHT_PATH) + svm_node_light_path<node_feature_mask>(kg, state, sd, stack, node.y, node.z, path_flag); + break; + SVM_CASE(NODE_OBJECT_INFO) + svm_node_object_info(kg, sd, stack, node.y, node.z); + break; + SVM_CASE(NODE_PARTICLE_INFO) + svm_node_particle_info(kg, sd, stack, node.y, node.z); + break; #if defined(__HAIR__) - case NODE_HAIR_INFO: - svm_node_hair_info(kg, sd, stack, node.y, node.z); - break; + 
SVM_CASE(NODE_HAIR_INFO) + svm_node_hair_info(kg, sd, stack, node.y, node.z); + break; #endif #if defined(__POINTCLOUD__) - case NODE_POINT_INFO: - svm_node_point_info(kg, sd, stack, node.y, node.z); - break; + SVM_CASE(NODE_POINT_INFO) + svm_node_point_info(kg, sd, stack, node.y, node.z); + break; #endif - case NODE_TEXTURE_MAPPING: - offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset); - break; - case NODE_MAPPING: - svm_node_mapping(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_MIN_MAX: - offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset); - break; - case NODE_CAMERA: - svm_node_camera(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_TEX_ENVIRONMENT: - svm_node_tex_environment(kg, sd, stack, node); - break; - case NODE_TEX_SKY: - offset = svm_node_tex_sky(kg, sd, stack, node, offset); - break; - case NODE_TEX_GRADIENT: - svm_node_tex_gradient(sd, stack, node); - break; - case NODE_TEX_VORONOI: - offset = svm_node_tex_voronoi<node_feature_mask>( - kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_TEX_MUSGRAVE: - offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_TEX_WAVE: - offset = svm_node_tex_wave(kg, sd, stack, node, offset); - break; - case NODE_TEX_MAGIC: - offset = svm_node_tex_magic(kg, sd, stack, node, offset); - break; - case NODE_TEX_CHECKER: - svm_node_tex_checker(kg, sd, stack, node); - break; - case NODE_TEX_BRICK: - offset = svm_node_tex_brick(kg, sd, stack, node, offset); - break; - case NODE_TEX_WHITE_NOISE: - svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_NORMAL: - offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_LIGHT_FALLOFF: - svm_node_light_falloff(sd, stack, node); - break; - case NODE_IES: - svm_node_ies(kg, sd, stack, node); - break; - case NODE_RGB_CURVES: - case NODE_VECTOR_CURVES: - offset = svm_node_curves(kg, sd, stack, 
node, offset); - break; - case NODE_FLOAT_CURVE: - offset = svm_node_curve(kg, sd, stack, node, offset); - break; - case NODE_TANGENT: - svm_node_tangent(kg, sd, stack, node); - break; - case NODE_NORMAL_MAP: - svm_node_normal_map(kg, sd, stack, node); - break; - case NODE_INVERT: - svm_node_invert(sd, stack, node.y, node.z, node.w); - break; - case NODE_MIX: - offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_SEPARATE_COLOR: - svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_COMBINE_COLOR: - svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w); - break; - case NODE_SEPARATE_VECTOR: - svm_node_separate_vector(sd, stack, node.y, node.z, node.w); - break; - case NODE_COMBINE_VECTOR: - svm_node_combine_vector(sd, stack, node.y, node.z, node.w); - break; - case NODE_SEPARATE_HSV: - offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_COMBINE_HSV: - offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_VECTOR_ROTATE: - svm_node_vector_rotate(sd, stack, node.y, node.z, node.w); - break; - case NODE_VECTOR_TRANSFORM: - svm_node_vector_transform(kg, sd, stack, node); - break; - case NODE_WIREFRAME: - svm_node_wireframe(kg, sd, stack, node); - break; - case NODE_WAVELENGTH: - svm_node_wavelength(kg, sd, stack, node.y, node.z); - break; - case NODE_BLACKBODY: - svm_node_blackbody(kg, sd, stack, node.y, node.z); - break; - case NODE_MAP_RANGE: - offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_VECTOR_MAP_RANGE: - offset = svm_node_vector_map_range(kg, sd, stack, node.y, node.z, node.w, offset); - break; - case NODE_CLAMP: - offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset); - break; + SVM_CASE(NODE_TEXTURE_MAPPING) + offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset); + break; + SVM_CASE(NODE_MAPPING) + 
svm_node_mapping(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_MIN_MAX) + offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset); + break; + SVM_CASE(NODE_CAMERA) + svm_node_camera(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_TEX_ENVIRONMENT) + svm_node_tex_environment(kg, sd, stack, node); + break; + SVM_CASE(NODE_TEX_SKY) + offset = svm_node_tex_sky(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TEX_GRADIENT) + svm_node_tex_gradient(sd, stack, node); + break; + SVM_CASE(NODE_TEX_VORONOI) + offset = svm_node_tex_voronoi<node_feature_mask>( + kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_TEX_MUSGRAVE) + offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_TEX_WAVE) + offset = svm_node_tex_wave(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TEX_MAGIC) + offset = svm_node_tex_magic(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TEX_CHECKER) + svm_node_tex_checker(kg, sd, stack, node); + break; + SVM_CASE(NODE_TEX_BRICK) + offset = svm_node_tex_brick(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TEX_WHITE_NOISE) + svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_NORMAL) + offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_LIGHT_FALLOFF) + svm_node_light_falloff(sd, stack, node); + break; + SVM_CASE(NODE_IES) + svm_node_ies(kg, sd, stack, node); + break; + SVM_CASE(NODE_CURVES) + offset = svm_node_curves(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_FLOAT_CURVE) + offset = svm_node_curve(kg, sd, stack, node, offset); + break; + SVM_CASE(NODE_TANGENT) + svm_node_tangent(kg, sd, stack, node); + break; + SVM_CASE(NODE_NORMAL_MAP) + svm_node_normal_map(kg, sd, stack, node); + break; + SVM_CASE(NODE_INVERT) + svm_node_invert(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_MIX) + offset = svm_node_mix(kg, sd, 
stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_SEPARATE_COLOR) + svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_COMBINE_COLOR) + svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_SEPARATE_VECTOR) + svm_node_separate_vector(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_COMBINE_VECTOR) + svm_node_combine_vector(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_SEPARATE_HSV) + offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_COMBINE_HSV) + offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_VECTOR_ROTATE) + svm_node_vector_rotate(sd, stack, node.y, node.z, node.w); + break; + SVM_CASE(NODE_VECTOR_TRANSFORM) + svm_node_vector_transform(kg, sd, stack, node); + break; + SVM_CASE(NODE_WIREFRAME) + svm_node_wireframe(kg, sd, stack, node); + break; + SVM_CASE(NODE_WAVELENGTH) + svm_node_wavelength(kg, sd, stack, node.y, node.z); + break; + SVM_CASE(NODE_BLACKBODY) + svm_node_blackbody(kg, sd, stack, node.y, node.z); + break; + SVM_CASE(NODE_MAP_RANGE) + offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_VECTOR_MAP_RANGE) + offset = svm_node_vector_map_range(kg, sd, stack, node.y, node.z, node.w, offset); + break; + SVM_CASE(NODE_CLAMP) + offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset); + break; #ifdef __SHADER_RAYTRACE__ - case NODE_BEVEL: - svm_node_bevel<node_feature_mask>(kg, state, sd, stack, node); - break; - case NODE_AMBIENT_OCCLUSION: - svm_node_ao<node_feature_mask>(kg, state, sd, stack, node); - break; + SVM_CASE(NODE_BEVEL) + svm_node_bevel<node_feature_mask>(kg, state, sd, stack, node); + break; + SVM_CASE(NODE_AMBIENT_OCCLUSION) + svm_node_ao<node_feature_mask>(kg, state, sd, stack, node); + break; #endif - case NODE_TEX_VOXEL: - IF_KERNEL_NODES_FEATURE(VOLUME) - 
{ - offset = svm_node_tex_voxel(kg, sd, stack, node, offset); - } - break; - case NODE_AOV_START: - if (!svm_node_aov_check(path_flag, render_buffer)) { - return; - } - break; - case NODE_AOV_COLOR: - svm_node_aov_color<node_feature_mask>(kg, state, sd, stack, node, render_buffer); - break; - case NODE_AOV_VALUE: - svm_node_aov_value<node_feature_mask>(kg, state, sd, stack, node, render_buffer); - break; + SVM_CASE(NODE_TEX_VOXEL) + IF_KERNEL_NODES_FEATURE(VOLUME) + { + offset = svm_node_tex_voxel(kg, sd, stack, node, offset); + } + break; + SVM_CASE(NODE_AOV_START) + if (!svm_node_aov_check(path_flag, render_buffer)) { + return; + } + break; + SVM_CASE(NODE_AOV_COLOR) + svm_node_aov_color<node_feature_mask>(kg, state, sd, stack, node, render_buffer); + break; + SVM_CASE(NODE_AOV_VALUE) + svm_node_aov_value<node_feature_mask>(kg, state, sd, stack, node, render_buffer); + break; default: kernel_assert(!"Unknown node type was passed to the SVM machine"); return; diff --git a/intern/cycles/kernel/svm/tex_coord.h b/intern/cycles/kernel/svm/tex_coord.h index d9138796c45..2a0130e11d4 100644 --- a/intern/cycles/kernel/svm/tex_coord.h +++ b/intern/cycles/kernel/svm/tex_coord.h @@ -138,7 +138,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dx(KernelGlobals kg, case NODE_TEXCO_WINDOW: { if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)); + data = camera_world_to_ndc(kg, sd, sd->ray_P); else data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; @@ -223,7 +223,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dy(KernelGlobals kg, case NODE_TEXCO_WINDOW: { if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)); + data = camera_world_to_ndc(kg, sd, sd->ray_P); else data = 
camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; diff --git a/intern/cycles/kernel/svm/types.h b/intern/cycles/kernel/svm/types.h index 82109ec4c4f..12d0ec141e6 100644 --- a/intern/cycles/kernel/svm/types.h +++ b/intern/cycles/kernel/svm/types.h @@ -17,104 +17,9 @@ CCL_NAMESPACE_BEGIN /* Nodes */ typedef enum ShaderNodeType { - NODE_END = 0, - NODE_SHADER_JUMP, - NODE_CLOSURE_BSDF, - NODE_CLOSURE_EMISSION, - NODE_CLOSURE_BACKGROUND, - NODE_CLOSURE_SET_WEIGHT, - NODE_CLOSURE_WEIGHT, - NODE_EMISSION_WEIGHT, - NODE_MIX_CLOSURE, - NODE_JUMP_IF_ZERO, - NODE_JUMP_IF_ONE, - NODE_GEOMETRY, - NODE_CONVERT, - NODE_TEX_COORD, - NODE_VALUE_F, - NODE_VALUE_V, - NODE_ATTR, - NODE_VERTEX_COLOR, - NODE_GEOMETRY_BUMP_DX, - NODE_GEOMETRY_BUMP_DY, - NODE_SET_DISPLACEMENT, - NODE_DISPLACEMENT, - NODE_VECTOR_DISPLACEMENT, - NODE_TEX_IMAGE, - NODE_TEX_IMAGE_BOX, - NODE_TEX_NOISE, - NODE_SET_BUMP, - NODE_ATTR_BUMP_DX, - NODE_ATTR_BUMP_DY, - NODE_VERTEX_COLOR_BUMP_DX, - NODE_VERTEX_COLOR_BUMP_DY, - NODE_TEX_COORD_BUMP_DX, - NODE_TEX_COORD_BUMP_DY, - NODE_CLOSURE_SET_NORMAL, - NODE_ENTER_BUMP_EVAL, - NODE_LEAVE_BUMP_EVAL, - NODE_HSV, - NODE_CLOSURE_HOLDOUT, - NODE_FRESNEL, - NODE_LAYER_WEIGHT, - NODE_CLOSURE_VOLUME, - NODE_PRINCIPLED_VOLUME, - NODE_MATH, - NODE_VECTOR_MATH, - NODE_RGB_RAMP, - NODE_GAMMA, - NODE_BRIGHTCONTRAST, - NODE_LIGHT_PATH, - NODE_OBJECT_INFO, - NODE_PARTICLE_INFO, - NODE_HAIR_INFO, - NODE_POINT_INFO, - NODE_TEXTURE_MAPPING, - NODE_MAPPING, - NODE_MIN_MAX, - NODE_CAMERA, - NODE_TEX_ENVIRONMENT, - NODE_TEX_SKY, - NODE_TEX_GRADIENT, - NODE_TEX_VORONOI, - NODE_TEX_MUSGRAVE, - NODE_TEX_WAVE, - NODE_TEX_MAGIC, - NODE_TEX_CHECKER, - NODE_TEX_BRICK, - NODE_TEX_WHITE_NOISE, - NODE_NORMAL, - NODE_LIGHT_FALLOFF, - NODE_IES, - NODE_RGB_CURVES, - NODE_VECTOR_CURVES, - NODE_TANGENT, - NODE_NORMAL_MAP, - NODE_INVERT, - NODE_MIX, - NODE_SEPARATE_COLOR, - NODE_COMBINE_COLOR, - NODE_SEPARATE_VECTOR, - NODE_COMBINE_VECTOR, - NODE_SEPARATE_HSV, - NODE_COMBINE_HSV, - 
NODE_VECTOR_ROTATE, - NODE_VECTOR_TRANSFORM, - NODE_WIREFRAME, - NODE_WAVELENGTH, - NODE_BLACKBODY, - NODE_MAP_RANGE, - NODE_VECTOR_MAP_RANGE, - NODE_CLAMP, - NODE_BEVEL, - NODE_AMBIENT_OCCLUSION, - NODE_TEX_VOXEL, - NODE_AOV_START, - NODE_AOV_COLOR, - NODE_AOV_VALUE, - NODE_FLOAT_CURVE, - /* NOTE: for best OpenCL performance, item definition in the enum must - * match the switch case order in `svm.h`. */ +#define SHADER_NODE_TYPE(name) name, +#include "node_types_template.h" + NODE_NUM } ShaderNodeType; typedef enum NodeAttributeOutputType { diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index f2e61d25002..05320deed19 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -535,7 +535,8 @@ typedef struct RaySelfPrimitives { typedef struct Ray { float3 P; /* origin */ float3 D; /* direction */ - float t; /* length of the ray */ + float tmin; /* start distance */ + float tmax; /* end distance */ float time; /* time (for motion blur) */ RaySelfPrimitives self; @@ -1072,94 +1073,6 @@ typedef struct KernelCamera { } KernelCamera; static_assert_align(KernelCamera, 16); -typedef struct KernelFilm { - float exposure; - int pass_flag; - - int light_pass_flag; - int pass_stride; - - int pass_combined; - int pass_depth; - int pass_position; - int pass_normal; - int pass_roughness; - int pass_motion; - - int pass_motion_weight; - int pass_uv; - int pass_object_id; - int pass_material_id; - - int pass_diffuse_color; - int pass_glossy_color; - int pass_transmission_color; - - int pass_diffuse_indirect; - int pass_glossy_indirect; - int pass_transmission_indirect; - int pass_volume_indirect; - - int pass_diffuse_direct; - int pass_glossy_direct; - int pass_transmission_direct; - int pass_volume_direct; - - int pass_emission; - int pass_background; - int pass_ao; - float pass_alpha_threshold; - - int pass_shadow; - float pass_shadow_scale; - - int pass_shadow_catcher; - int pass_shadow_catcher_sample_count; - int 
pass_shadow_catcher_matte; - - int filter_table_offset; - - int cryptomatte_passes; - int cryptomatte_depth; - int pass_cryptomatte; - - int pass_adaptive_aux_buffer; - int pass_sample_count; - - int pass_mist; - float mist_start; - float mist_inv_depth; - float mist_falloff; - - int pass_denoising_normal; - int pass_denoising_albedo; - int pass_denoising_depth; - - int pass_aov_color; - int pass_aov_value; - int pass_lightgroup; - - /* XYZ to rendering color space transform. float4 instead of float3 to - * ensure consistent padding/alignment across devices. */ - float4 xyz_to_r; - float4 xyz_to_g; - float4 xyz_to_b; - float4 rgb_to_y; - /* Rec709 to rendering color space. */ - float4 rec709_to_r; - float4 rec709_to_g; - float4 rec709_to_b; - int is_rec709; - - int pass_bake_primitive; - int pass_bake_differential; - - int use_approximate_shadow_catcher; - - int pad1; -} KernelFilm; -static_assert_align(KernelFilm, 16); - typedef struct KernelFilmConvert { int pass_offset; int pass_stride; @@ -1201,108 +1114,6 @@ typedef struct KernelFilmConvert { } KernelFilmConvert; static_assert_align(KernelFilmConvert, 16); -typedef struct KernelBackground { - /* only shader index */ - int surface_shader; - int volume_shader; - float volume_step_size; - int transparent; - float transparent_roughness_squared_threshold; - - /* portal sampling */ - float portal_weight; - int num_portals; - int portal_offset; - - /* sun sampling */ - float sun_weight; - /* xyz store direction, w the angle. float4 instead of float3 is used - * to ensure consistent padding/alignment across devices. 
*/ - float4 sun; - - /* map sampling */ - float map_weight; - int map_res_x; - int map_res_y; - - int use_mis; - - int lightgroup; - - /* Padding */ - int pad1, pad2; -} KernelBackground; -static_assert_align(KernelBackground, 16); - -typedef struct KernelIntegrator { - /* emission */ - int use_direct_light; - int num_distribution; - int num_all_lights; - float pdf_triangles; - float pdf_lights; - float light_inv_rr_threshold; - - /* bounces */ - int min_bounce; - int max_bounce; - - int max_diffuse_bounce; - int max_glossy_bounce; - int max_transmission_bounce; - int max_volume_bounce; - - /* AO bounces */ - int ao_bounces; - float ao_bounces_distance; - float ao_bounces_factor; - float ao_additive_factor; - - /* transparent */ - int transparent_min_bounce; - int transparent_max_bounce; - int transparent_shadows; - - /* caustics */ - int caustics_reflective; - int caustics_refractive; - float filter_glossy; - - /* seed */ - int seed; - - /* clamp */ - float sample_clamp_direct; - float sample_clamp_indirect; - - /* mis */ - int use_lamp_mis; - - /* caustics */ - int use_caustics; - - /* sampler */ - int sampling_pattern; - - /* volume render */ - int use_volumes; - int volume_max_steps; - float volume_step_rate; - - int has_shadow_catcher; - float scrambling_distance; - - /* Closure filter. */ - int filter_closures; - - /* MIS debugging. */ - int direct_light_sampling_type; - - /* padding */ - int pad1; -} KernelIntegrator; -static_assert_align(KernelIntegrator, 16); - typedef enum KernelBVHLayout { BVH_LAYOUT_NONE = 0, @@ -1320,36 +1131,25 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL, } KernelBVHLayout; -typedef struct KernelBVH { - /* Own BVH */ - int root; - int have_motion; - int have_curves; - int bvh_layout; - int use_bvh_steps; - int curve_subdivisions; +/* Specialized struct that can become constants in dynamic compilation. 
*/ +#define KERNEL_STRUCT_BEGIN(name, parent) struct name { +#define KERNEL_STRUCT_END(name) \ + } \ + ; \ + static_assert_align(name, 16); - /* Custom BVH */ -#ifdef __KERNEL_OPTIX__ - OptixTraversableHandle scene; -#elif defined __METALRT__ - metalrt_as_type scene; +#ifdef __KERNEL_USE_DATA_CONSTANTS__ +# define KERNEL_STRUCT_MEMBER(parent, type, name) type __unused_##name; #else -# ifdef __EMBREE__ - RTCScene scene; -# ifndef __KERNEL_64_BIT__ - int pad2; -# endif -# else - int scene, pad2; -# endif +# define KERNEL_STRUCT_MEMBER(parent, type, name) type name; #endif -} KernelBVH; -static_assert_align(KernelBVH, 16); + +#include "kernel/data_template.h" typedef struct KernelTables { int beckmann_offset; - int pad1, pad2, pad3; + int filter_table_offset; + int pad1, pad2; } KernelTables; static_assert_align(KernelTables, 16); @@ -1362,18 +1162,37 @@ typedef struct KernelBake { static_assert_align(KernelBake, 16); typedef struct KernelData { + /* Features and limits. */ uint kernel_features; uint max_closures; uint max_shaders; uint volume_stack_size; + /* Always dynamic data mambers. */ KernelCamera cam; - KernelFilm film; - KernelBackground background; - KernelIntegrator integrator; - KernelBVH bvh; - KernelTables tables; KernelBake bake; + KernelTables tables; + + /* Potentially specialized data members. */ +#define KERNEL_STRUCT_BEGIN(name, parent) name parent; +#include "kernel/data_template.h" + + /* Device specific BVH. 
*/ +#ifdef __KERNEL_OPTIX__ + OptixTraversableHandle device_bvh; +#elif defined __METALRT__ + metalrt_as_type device_bvh; +#else +# ifdef __EMBREE__ + RTCScene device_bvh; +# ifndef __KERNEL_64_BIT__ + int pad1; +# endif +# else + int device_bvh, pad1; +# endif +#endif + int pad2, pad3; } KernelData; static_assert_align(KernelData, 16); diff --git a/intern/cycles/scene/film.cpp b/intern/cycles/scene/film.cpp index 8239ee84b82..a6a8f90a449 100644 --- a/intern/cycles/scene/film.cpp +++ b/intern/cycles/scene/film.cpp @@ -394,7 +394,7 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) vector<float> table = filter_table(filter_type, filter_width); scene->lookup_tables->remove_table(&filter_table_offset_); filter_table_offset_ = scene->lookup_tables->add_table(dscene, table); - kfilm->filter_table_offset = (int)filter_table_offset_; + dscene->data.tables.filter_table_offset = (int)filter_table_offset_; /* mist pass parameters */ kfilm->mist_start = mist_start; diff --git a/intern/cycles/scene/geometry.cpp b/intern/cycles/scene/geometry.cpp index bdc8839e277..67ff118692e 100644 --- a/intern/cycles/scene/geometry.cpp +++ b/intern/cycles/scene/geometry.cpp @@ -1362,7 +1362,7 @@ void GeometryManager::device_update_bvh(Device *device, dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0); dscene->data.bvh.curve_subdivisions = scene->params.curve_subdivisions(); /* The scene handle is set in 'CPUDevice::const_copy_to' and 'OptiXDevice::const_copy_to' */ - dscene->data.bvh.scene = 0; + dscene->data.device_bvh = 0; } /* Set of flags used to help determining what data has been modified or needs reallocation, so we diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp index eedb2a4fa3a..18cd665ac74 100644 --- a/intern/cycles/scene/scene.cpp +++ b/intern/cycles/scene/scene.cpp @@ -369,6 +369,8 @@ void Scene::device_update(Device *device_, Progress &progress) device->const_copy_to("data", &dscene.data, 
sizeof(dscene.data)); } + device->optimize_for_scene(this); + if (print_stats) { size_t mem_used = util_guarded_get_mem_used(); size_t mem_peak = util_guarded_get_mem_peak(); diff --git a/intern/cycles/scene/scene.h b/intern/cycles/scene/scene.h index d04c6a27f11..d1004bb7b66 100644 --- a/intern/cycles/scene/scene.h +++ b/intern/cycles/scene/scene.h @@ -82,7 +82,7 @@ class DeviceScene { device_vector<uint> patches; - /* pointcloud */ + /* point-cloud */ device_vector<float4> points; device_vector<uint> points_shader; @@ -124,7 +124,7 @@ class DeviceScene { /* integrator */ device_vector<float> sample_pattern_lut; - /* ies lights */ + /* IES lights */ device_vector<float> ies_lights; KernelData data; diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp index f93a1a5231a..bedb0fe2902 100644 --- a/intern/cycles/scene/shader_nodes.cpp +++ b/intern/cycles/scene/shader_nodes.cpp @@ -6671,7 +6671,7 @@ void CurvesNode::compile(SVMCompiler &compiler, ShaderInput *fac_in = input("Fac"); - compiler.add_node(type, + compiler.add_node(ShaderNodeType(type), compiler.encode_uchar4(compiler.stack_assign(fac_in), compiler.stack_assign(value_in), compiler.stack_assign(value_out), @@ -6736,7 +6736,7 @@ void RGBCurvesNode::constant_fold(const ConstantFolder &folder) void RGBCurvesNode::compile(SVMCompiler &compiler) { - CurvesNode::compile(compiler, NODE_RGB_CURVES, input("Color"), output("Color")); + CurvesNode::compile(compiler, NODE_CURVES, input("Color"), output("Color")); } void RGBCurvesNode::compile(OSLCompiler &compiler) @@ -6774,7 +6774,7 @@ void VectorCurvesNode::constant_fold(const ConstantFolder &folder) void VectorCurvesNode::compile(SVMCompiler &compiler) { - CurvesNode::compile(compiler, NODE_VECTOR_CURVES, input("Vector"), output("Vector")); + CurvesNode::compile(compiler, NODE_CURVES, input("Vector"), output("Vector")); } void VectorCurvesNode::compile(OSLCompiler &compiler) diff --git a/intern/cycles/scene/svm.cpp 
b/intern/cycles/scene/svm.cpp index 4bc5a1b9cc2..ede3f87e7e3 100644 --- a/intern/cycles/scene/svm.cpp +++ b/intern/cycles/scene/svm.cpp @@ -44,8 +44,6 @@ void SVMShaderManager::device_update_shader(Scene *scene, } assert(shader->graph); - svm_nodes->push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0)); - SVMCompiler::Summary summary; SVMCompiler compiler(scene); compiler.background = (shader == scene->background->get_shader(scene)); @@ -170,6 +168,9 @@ SVMCompiler::SVMCompiler(Scene *scene) : scene(scene) background = false; mix_weight_offset = SVM_STACK_INVALID; compile_failed = false; + + /* This struct has one entry for every node, in order of ShaderNodeType definition. */ + svm_node_types_used = (std::atomic_int *)&scene->dscene.data.svm_usage; } int SVMCompiler::stack_size(SocketType::Type type) @@ -378,11 +379,13 @@ void SVMCompiler::add_node(int a, int b, int c, int d) void SVMCompiler::add_node(ShaderNodeType type, int a, int b, int c) { + svm_node_types_used[type] = true; current_svm_nodes.push_back_slow(make_int4(type, a, b, c)); } void SVMCompiler::add_node(ShaderNodeType type, const float3 &f) { + svm_node_types_used[type] = true; current_svm_nodes.push_back_slow( make_int4(type, __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z))); } @@ -663,6 +666,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node, /* Add instruction to skip closure and its dependencies if mix * weight is zero. */ + svm_node_types_used[NODE_JUMP_IF_ONE] = true; current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ONE, 0, stack_assign(facin), 0)); int node_jump_skip_index = current_svm_nodes.size() - 1; @@ -678,6 +682,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node, /* Add instruction to skip closure and its dependencies if mix * weight is zero. 
*/ + svm_node_types_used[NODE_JUMP_IF_ZERO] = true; current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ZERO, 0, stack_assign(facin), 0)); int node_jump_skip_index = current_svm_nodes.size() - 1; @@ -844,6 +849,9 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Summary *summary) { + svm_node_types_used[NODE_SHADER_JUMP] = true; + svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0)); + /* copy graph for shader with bump mapping */ ShaderNode *output = shader->graph->output(); int start_num_svm_nodes = svm_nodes.size(); diff --git a/intern/cycles/scene/svm.h b/intern/cycles/scene/svm.h index 19746616207..f72375e7f87 100644 --- a/intern/cycles/scene/svm.h +++ b/intern/cycles/scene/svm.h @@ -211,6 +211,7 @@ class SVMCompiler { /* compile */ void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type); + std::atomic_int *svm_node_types_used; array<int4> current_svm_nodes; ShaderType current_type; Shader *current_shader; diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index b0de0b25a45..c5b1cd51030 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN ccl_device bool ray_sphere_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 sphere_P, float sphere_radius, ccl_private float3 *isect_P, @@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, return false; } const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ - if (t < ray_t) { + if (t > ray_tmin && t < ray_tmax) { *isect_t = t; *isect_P = ray_P + ray_D * t; return true; @@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, ccl_device bool ray_aligned_disk_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 disk_P, float disk_radius, ccl_private float3 
*isect_P, @@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, } /* Compute t to intersection point. */ const float t = -disk_t / div; - if (t < 0.0f || t > ray_t) { + if (!(t > ray_tmin && t < ray_tmax)) { return false; } /* Test if within radius. */ @@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, ccl_device bool ray_disk_intersect(float3 ray_P, float3 ray_D, - float ray_t, + float ray_tmin, + float ray_tmax, float3 disk_P, float3 disk_N, float disk_radius, @@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P, } float3 P = ray_P + t * ray_D; float3 T = P - disk_P; - if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) { + + if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) { *isect_P = ray_P + t * ray_D; *isect_t = t; return true; @@ -103,7 +107,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P, ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, float3 ray_dir, - float ray_t, + float ray_tmin, + float ray_tmax, const float3 tri_a, const float3 tri_b, const float3 tri_c, @@ -149,16 +154,14 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, /* Perform depth test. 
*/ const float T = dot3(v0, Ng); - const int sign_den = (__float_as_int(den) & 0x80000000); - const float sign_T = xor_signmask(T, sign_den); - if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) { + const float t = T / den; + if (!(t >= ray_tmin && t <= ray_tmax)) { return false; } - const float inv_den = 1.0f / den; - *isect_u = U * inv_den; - *isect_v = V * inv_den; - *isect_t = T * inv_den; + *isect_u = U / den; + *isect_v = V / den; + *isect_t = t; return true; #undef dot3 @@ -171,8 +174,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, */ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, - float ray_mint, - float ray_maxt, + float ray_tmin, + float ray_tmax, float3 quad_P, float3 quad_u, float3 quad_v, @@ -185,7 +188,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P, { /* Perform intersection test. */ float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); - if (t < ray_mint || t > ray_maxt) { + if (!(t > ray_tmin && t < ray_tmax)) { return false; } const float3 hit = ray_P + t * ray_D; diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp index 66ff866ee10..0c318cea44a 100644 --- a/intern/cycles/util/string.cpp +++ b/intern/cycles/util/string.cpp @@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other) } } +void string_replace_same_length(string &haystack, const string &needle, const string &other) +{ + assert(needle.size() == other.size()); + size_t pos = 0; + while (pos != string::npos) { + pos = haystack.find(needle, pos); + if (pos != string::npos) { + memcpy(haystack.data() + pos, other.data(), other.size()); + pos += other.size(); + } + } +} + string string_remove_trademark(const string &s) { string result = s; @@ -164,6 +177,11 @@ string to_string(const char *str) return string(str); } +string to_string(const float4 &v) +{ + return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w); +} + string 
string_to_lower(const string &s) { string r = s; diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h index a74feee1750..ecbe9e106c6 100644 --- a/intern/cycles/util/string.h +++ b/intern/cycles/util/string.h @@ -38,12 +38,14 @@ void string_split(vector<string> &tokens, const string &separators = "\t ", bool skip_empty_tokens = true); void string_replace(string &haystack, const string &needle, const string &other); +void string_replace_same_length(string &haystack, const string &needle, const string &other); bool string_startswith(string_view s, string_view start); bool string_endswith(string_view s, string_view end); string string_strip(const string &s); string string_remove_trademark(const string &s); string string_from_bool(const bool var); string to_string(const char *str); +string to_string(const float4 &v); string string_to_lower(const string &s); /* Wide char strings are only used on Windows to deal with non-ASCII |