diff options
Diffstat (limited to 'intern/cycles')
137 files changed, 4112 insertions, 11813 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 121c8bdad6e..e5a5e9773d3 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -286,6 +286,7 @@ include_directories( ${OPENEXR_INCLUDE_DIR} ${OPENEXR_INCLUDE_DIRS} ${PUGIXML_INCLUDE_DIR} + ${TBB_INCLUDE_DIRS} ) if(CYCLES_STANDALONE_REPOSITORY) diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 496e8e9310b..2316800e21e 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -92,10 +92,6 @@ if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() -if(WITH_NEW_OBJECT_TYPES) - add_definitions(-DWITH_NEW_OBJECT_TYPES) -endif() - if(WITH_OPENVDB) add_definitions(-DWITH_OPENVDB ${OPENVDB_DEFINITIONS}) list(APPEND INC_SYS @@ -106,6 +102,13 @@ if(WITH_OPENVDB) ) endif() +if(WITH_OPENIMAGEDENOISE) + add_definitions(-DWITH_OPENIMAGEDENOISE) + list(APPEND INC_SYS + ${OPENIMAGEDENOISE_INCLUDE_DIRS} + ) +endif() + blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}") # avoid link failure with clang 3.4 debug diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e7ea5e7a1f6..7566ca28dd7 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -179,7 +179,8 @@ def reset(engine, data, depsgraph): import _cycles import bpy - if bpy.app.debug_value == 256: + prefs = bpy.context.preferences + if prefs.experimental.use_cycles_debug and prefs.view.show_developer_ui: _cycles.debug_flags_update(depsgraph.scene.as_pointer()) else: _cycles.debug_flags_reset() diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 1635afab210..840efb65d96 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -55,8 +55,7 @@ enum_displacement_methods = ( enum_bvh_layouts = ( ('BVH2', "BVH2", "", 1), - ('BVH4', "BVH4", "", 
2), - ('BVH8', "BVH8", "", 4), + ('EMBREE', "Embree", "", 4), ) enum_bvh_types = ( @@ -78,20 +77,9 @@ enum_panorama_types = ( ('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"), ) -enum_curve_primitives = ( - ('TRIANGLES', "Triangles", "Create triangle geometry around strands"), - ('LINE_SEGMENTS', "Line Segments", "Use line segment primitives"), - ('CURVE_SEGMENTS', "Curve Segments", "Use segmented cardinal curve primitives"), -) - -enum_triangle_curves = ( - ('CAMERA_TRIANGLES', "Planes", "Create individual triangles forming planes that face camera"), - ('TESSELLATED_TRIANGLES', "Tessellated", "Create mesh surrounding each strand"), -) - enum_curve_shape = ( - ('RIBBONS', "Ribbons", "Ignore thickness of each strand"), - ('THICK', "Thick", "Use thickness of strand when rendering"), + ('RIBBONS', "Rounded Ribbons", "Render hair as flat ribbon with rounded normals, for fast rendering"), + ('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"), ) enum_tile_order = ( @@ -194,10 +182,36 @@ enum_aov_types = ( ('COLOR', "Color", "Write a Color pass", 1), ) -enum_viewport_denoising = ( - ('NONE', "None", "Disable viewport denoising", 0), - ('OPTIX', "OptiX AI-Accelerated", "Use the OptiX denoiser running on the GPU (requires at least one compatible OptiX device)", 1), -) +def enum_openimagedenoise_denoiser(self, context): + if _cycles.with_openimagedenoise: + return [('OPENIMAGEDENOISE', "OpenImageDenoise", "Use Intel OpenImageDenoise AI denoiser running on the CPU", 4)] + return [] + +def enum_optix_denoiser(self, context): + if not context or bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX')): + return [('OPTIX', "OptiX", "Use the OptiX AI denoiser with GPU acceleration, only available on NVIDIA GPUs", 2)] + return [] + +def enum_preview_denoiser(self, context): + optix_items = enum_optix_denoiser(self, context) + oidn_items = enum_openimagedenoise_denoiser(self, context) + + if 
len(optix_items): + auto_label = "Fastest (Optix)" + elif len(oidn_items): + auto_label = "Fastest (OpenImageDenoise)" + else: + auto_label = "None" + + items = [('AUTO', auto_label, "Use the fastest available denoiser for viewport rendering", 0)] + items += optix_items + items += oidn_items + return items + +def enum_denoiser(self, context): + items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)] + items += enum_optix_denoiser(self, context) + return items enum_denoising_optix_input_passes = ( ('RGB', "Color", "Use only color as input", 1), @@ -236,11 +250,29 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): description="Pause all viewport preview renders", default=False, ) - preview_denoising: EnumProperty( - name="Viewport Denoising", - description="Denoise the image after each preview update with the selected denoiser engine", - items=enum_viewport_denoising, - default='NONE', + + use_denoising: BoolProperty( + name="Use Denoising", + description="Denoise the rendered image", + default=False, + ) + use_preview_denoising: BoolProperty( + name="Use Viewport Denoising", + description="Denoise the image in the 3D viewport", + default=False, + ) + + denoiser: EnumProperty( + name="Denoiser", + description="Denoise the image with the selected denoiser", + items=enum_denoiser, + default=1, + ) + preview_denoiser: EnumProperty( + name="Viewport Denoiser", + description="Denoise the image after each preview update with the selected denoiser", + items=enum_preview_denoiser, + default=0, ) use_square_samples: BoolProperty( @@ -256,7 +288,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=128, ) preview_samples: IntProperty( - name="Preview Samples", + name="Viewport Samples", description="Number of samples to render in the viewport, unlimited if 0", min=0, max=(1 << 24), default=32, @@ -476,7 +508,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): subtype='PIXEL' ) preview_dicing_rate: 
FloatProperty( - name="Preview Dicing Rate", + name="Viewport Dicing Rate", description="Size of a micropolygon in pixels during preview render", min=0.1, max=1000.0, soft_min=0.5, default=8.0, @@ -629,11 +661,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): items=enum_bvh_types, default='DYNAMIC_BVH', ) - use_bvh_embree: BoolProperty( - name="Use Embree", - description="Use Embree as ray accelerator", - default=False, - ) debug_use_spatial_splits: BoolProperty( name="Use Spatial Splits", description="Use BVH spatial splits: longer builder time, faster render", @@ -786,7 +813,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): debug_bvh_layout: EnumProperty( name="BVH Layout", items=enum_bvh_layouts, - default='BVH8', + default='EMBREE', ) debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False) @@ -1241,39 +1268,17 @@ class CyclesObjectSettings(bpy.types.PropertyGroup): class CyclesCurveRenderSettings(bpy.types.PropertyGroup): - primitive: EnumProperty( - name="Primitive", - description="Type of primitive used for hair rendering", - items=enum_curve_primitives, - default='LINE_SEGMENTS', - ) shape: EnumProperty( name="Shape", description="Form of hair", items=enum_curve_shape, - default='THICK', - ) - cull_backfacing: BoolProperty( - name="Cull Back-faces", - description="Do not test the back-face of each strand", - default=True, - ) - use_curves: BoolProperty( - name="Use Cycles Hair Rendering", - description="Activate Cycles hair rendering for particle system", - default=True, - ) - resolution: IntProperty( - name="Resolution", - description="Resolution of generated mesh", - min=3, max=64, - default=3, + default='RIBBONS', ) subdivisions: IntProperty( name="Subdivisions", description="Number of subdivisions used in Cardinal curve intersection (power of 2)", min=0, max=24, - default=4, + default=2, ) @classmethod @@ -1369,7 +1374,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): use_denoising: BoolProperty( 
name="Use Denoising", description="Denoise the rendered image", - default=False, + default=True, update=update_render_passes, ) denoising_diffuse_direct: BoolProperty( @@ -1439,12 +1444,6 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=0, ) - use_optix_denoising: BoolProperty( - name="OptiX AI-Accelerated", - description="Use the OptiX denoiser to denoise the rendered image", - default=False, - update=update_render_passes, - ) denoising_optix_input_passes: EnumProperty( name="Input Passes", description="Passes handed over to the OptiX denoiser (this can have different effects on the denoised image)", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 78a44881743..b049d0bf2b4 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -112,10 +112,6 @@ def show_device_active(context): return True return context.preferences.addons[__package__].preferences.has_active_device() -def show_optix_denoising(context): - # OptiX AI denoiser can be used when at least one device supports OptiX - return bool(context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX')) - def draw_samples_info(layout, context): cscene = context.scene.cycles @@ -190,11 +186,6 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): col.prop(cscene, "aa_samples", text="Render") col.prop(cscene, "preview_aa_samples", text="Viewport") - # Viewport denoising is currently only supported with OptiX - if show_optix_denoising(context): - col = layout.column() - col.prop(cscene, "preview_denoising") - if not use_branched_path(context): draw_samples_info(layout, context) @@ -256,6 +247,39 @@ class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel): col.prop(cscene, "adaptive_threshold", text="Noise Threshold") col.prop(cscene, "adaptive_min_samples", text="Min Samples") + +class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel): + bl_label = "Denoising" + bl_parent_id = 
"CYCLES_RENDER_PT_sampling" + bl_options = {'DEFAULT_CLOSED'} + + def draw(self, context): + layout = self.layout + layout.use_property_split = True + layout.use_property_decorate = False + + scene = context.scene + cscene = scene.cycles + + heading = layout.column(align=True, heading="Render") + row = heading.row(align=True) + row.prop(cscene, "use_denoising", text="") + sub = row.row() + sub.active = cscene.use_denoising + sub.prop(cscene, "denoiser", text="") + + heading = layout.column(align=False, heading="Viewport") + row = heading.row(align=True) + row.prop(cscene, "use_preview_denoising", text="") + sub = row.row() + sub.active = cscene.use_preview_denoising + sub.prop(cscene, "preview_denoiser", text="") + + sub = heading.row(align=True) + sub.active = cscene.use_preview_denoising + sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample") + + class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): bl_label = "Advanced" bl_parent_id = "CYCLES_RENDER_PT_sampling" @@ -387,13 +411,6 @@ class CYCLES_RENDER_PT_hair(CyclesButtonsPanel, Panel): bl_label = "Hair" bl_options = {'DEFAULT_CLOSED'} - def draw_header(self, context): - layout = self.layout - scene = context.scene - ccscene = scene.cycles_curves - - layout.prop(ccscene, "use_curves", text="") - def draw(self, context): layout = self.layout layout.use_property_split = True @@ -402,18 +419,10 @@ class CYCLES_RENDER_PT_hair(CyclesButtonsPanel, Panel): scene = context.scene ccscene = scene.cycles_curves - layout.active = ccscene.use_curves - col = layout.column() col.prop(ccscene, "shape", text="Shape") - if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'): - col.prop(ccscene, "cull_backfacing", text="Cull back-faces") - col.prop(ccscene, "primitive", text="Primitive") - - if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK': - col.prop(ccscene, "resolution", text="Resolution") - elif ccscene.primitive == 'CURVE_SEGMENTS': - 
col.prop(ccscene, "subdivisions", text="Curve subdivisions") + if ccscene.shape == 'RIBBONS': + col.prop(ccscene, "subdivisions", text="Curve Subdivisions") class CYCLES_RENDER_PT_volumes(CyclesButtonsPanel, Panel): @@ -693,16 +702,20 @@ class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Pa col = layout.column() - if _cycles.with_embree: - row = col.row() - row.active = use_cpu(context) - row.prop(cscene, "use_bvh_embree") + use_embree = False + if use_cpu(context): + use_embree = _cycles.with_embree + if not use_embree: + sub = col.column(align=True) + sub.label(text="Cycles built without Embree support") + sub.label(text="CPU raytracing performance will be poor") + col.prop(cscene, "debug_use_spatial_splits") sub = col.column() - sub.active = not cscene.use_bvh_embree or not _cycles.with_embree + sub.active = not use_embree sub.prop(cscene, "debug_use_hair_bvh") sub = col.column() - sub.active = not cscene.debug_use_spatial_splits and not cscene.use_bvh_embree + sub.active = not cscene.debug_use_spatial_splits and not use_embree sub.prop(cscene, "debug_bvh_time_steps") @@ -741,11 +754,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel): col.prop(rd, "preview_pixel_size", text="Pixel Size") col.prop(cscene, "preview_start_resolution", text="Start Pixels") - if show_optix_denoising(context): - sub = col.row(align=True) - sub.active = cscene.preview_denoising != 'NONE' - sub.prop(cscene, "preview_denoising_start_sample", text="Denoising Start Sample") - class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel): bl_label = "Filter" @@ -968,12 +976,17 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): bl_context = "view_layer" bl_options = {'DEFAULT_CLOSED'} + @classmethod + def poll(cls, context): + cscene = context.scene.cycles + return CyclesButtonsPanel.poll(context) and cscene.use_denoising + def draw_header(self, context): scene = context.scene view_layer = context.view_layer cycles_view_layer = 
view_layer.cycles - layout = self.layout + layout = self.layout layout.prop(cycles_view_layer, "use_denoising", text="") def draw(self, context): @@ -984,18 +997,17 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): scene = context.scene view_layer = context.view_layer cycles_view_layer = view_layer.cycles + denoiser = scene.cycles.denoiser - layout.active = cycles_view_layer.use_denoising + layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising col = layout.column() - if show_optix_denoising(context): - col.prop(cycles_view_layer, "use_optix_denoising") - col.separator(factor=2.0) - - if cycles_view_layer.use_optix_denoising: - col.prop(cycles_view_layer, "denoising_optix_input_passes") - return + if denoiser == 'OPTIX': + col.prop(cycles_view_layer, "denoising_optix_input_passes") + return + elif denoiser == 'OPENIMAGEDENOISE': + return col.prop(cycles_view_layer, "denoising_radius", text="Radius") @@ -1190,6 +1202,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel): def draw(self, context): layout = self.layout + layout.use_property_split = True rd = context.scene.render # scene = context.scene @@ -1199,10 +1212,10 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel): layout.active = (rd.use_motion_blur and cob.use_motion_blur) - row = layout.row() + col = layout.column() + col.prop(cob, "motion_steps", text="Steps") if ob.type != 'CAMERA': - row.prop(cob, "use_deform_motion", text="Deformation") - row.prop(cob, "motion_steps", text="Steps") + col.prop(cob, "use_deform_motion", text="Deformation") def has_geometry_visibility(ob): @@ -1575,17 +1588,18 @@ class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel): def draw(self, context): layout = self.layout + layout.use_property_split = True + layout.use_property_decorate = False world = context.world visibility = world.cycles_visibility - flow = layout.column_flow() - - flow.prop(visibility, "camera") - flow.prop(visibility, "diffuse") - 
flow.prop(visibility, "glossy") - flow.prop(visibility, "transmission") - flow.prop(visibility, "scatter") + col = layout.column() + col.prop(visibility, "camera") + col.prop(visibility, "diffuse") + col.prop(visibility, "glossy") + col.prop(visibility, "transmission") + col.prop(visibility, "scatter") class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel): @@ -1975,7 +1989,10 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): @classmethod def poll(cls, context): - return CyclesButtonsPanel.poll(context) and bpy.app.debug_value == 256 + prefs = bpy.context.preferences + return (CyclesButtonsPanel.poll(context) + and prefs.experimental.use_cycles_debug + and prefs.view.show_developer_ui) def draw(self, context): layout = self.layout @@ -2248,6 +2265,7 @@ classes = ( CYCLES_RENDER_PT_sampling, CYCLES_RENDER_PT_sampling_sub_samples, CYCLES_RENDER_PT_sampling_adaptive, + CYCLES_RENDER_PT_sampling_denoising, CYCLES_RENDER_PT_sampling_advanced, CYCLES_RENDER_PT_light_paths, CYCLES_RENDER_PT_light_paths_max_bounces, diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index 40a1a2c2edc..011678a7a65 100644 --- a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -867,13 +867,13 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d, } } -BufferParams BlenderSync::get_buffer_params(BL::Scene &b_scene, - BL::RenderSettings &b_render, +BufferParams BlenderSync::get_buffer_params(BL::RenderSettings &b_render, BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, - int height) + int height, + const bool use_denoiser) { BufferParams params; bool use_border = false; @@ -907,8 +907,7 @@ BufferParams BlenderSync::get_buffer_params(BL::Scene &b_scene, PassType display_pass = update_viewport_display_passes(b_v3d, params.passes); /* Can only denoise the combined image pass */ - params.denoising_data_pass = display_pass == PASS_COMBINED && - 
update_viewport_display_denoising(b_v3d, b_scene); + params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser; return params; } diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index 847a43c5f34..82c99631a89 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -18,7 +18,6 @@ #include "render/camera.h" #include "render/curves.h" #include "render/hair.h" -#include "render/mesh.h" #include "render/object.h" #include "render/scene.h" @@ -39,27 +38,6 @@ ParticleCurveData::~ParticleCurveData() { } -static void interp_weights(float t, float data[4]) -{ - /* Cardinal curve interpolation */ - float t2 = t * t; - float t3 = t2 * t; - float fc = 0.71f; - - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; -} - -static void curveinterp_v3_v3v3v3v3( - float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4]) -{ - p->x = v1->x * w[0] + v2->x * w[1] + v3->x * w[2] + v4->x * w[3]; - p->y = v1->y * w[0] + v2->y * w[1] + v3->y * w[2] + v4->y * w[3]; - p->z = v1->z * w[0] + v2->z * w[1] + v3->z * w[2] + v4->z * w[3]; -} - static float shaperadius(float shape, float root, float tip, float time) { assert(time >= 0.0f); @@ -77,43 +55,13 @@ static float shaperadius(float shape, float root, float tip, float time) /* curve functions */ -static void InterpolateKeySegments( - int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData) -{ - float3 ckey_loc1 = CData->curvekey_co[key]; - float3 ckey_loc2 = ckey_loc1; - float3 ckey_loc3 = CData->curvekey_co[key + 1]; - float3 ckey_loc4 = ckey_loc3; - - if (key > CData->curve_firstkey[curve]) - ckey_loc1 = CData->curvekey_co[key - 1]; - - if (key < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2) - ckey_loc4 = 
CData->curvekey_co[key + 2]; - - float time1 = CData->curvekey_time[key] / CData->curve_length[curve]; - float time2 = CData->curvekey_time[key + 1] / CData->curve_length[curve]; - - float dfra = (time2 - time1) / (float)segno; - - if (time) - *time = (dfra * seg) + time1; - - float t[4]; - - interp_weights((float)seg / (float)segno, t); - - if (keyloc) - curveinterp_v3_v3v3v3v3(keyloc, &ckey_loc1, &ckey_loc2, &ckey_loc3, &ckey_loc4, t); -} - static bool ObtainCacheParticleData( - Geometry *geom, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background) + Hair *hair, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background) { int curvenum = 0; int keyno = 0; - if (!(geom && b_mesh && b_ob && CData)) + if (!(hair && b_mesh && b_ob && CData)) return false; Transform tfm = get_transform(b_ob->matrix_world()); @@ -129,7 +77,7 @@ static bool ObtainCacheParticleData( if ((b_part.render_type() == BL::ParticleSettings::render_type_PATH) && (b_part.type() == BL::ParticleSettings::type_HAIR)) { - int shader = clamp(b_part.material() - 1, 0, geom->used_shaders.size() - 1); + int shader = clamp(b_part.material() - 1, 0, hair->used_shaders.size() - 1); int display_step = background ? b_part.render_step() : b_part.display_step(); int totparts = b_psys.particles.length(); int totchild = background ? 
b_psys.child_particles.length() : @@ -203,14 +151,14 @@ static bool ObtainCacheParticleData( return true; } -static bool ObtainCacheParticleUV(Geometry *geom, +static bool ObtainCacheParticleUV(Hair *hair, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int uv_num) { - if (!(geom && b_mesh && b_ob && CData)) + if (!(hair && b_mesh && b_ob && CData)) return false; CData->curve_uv.clear(); @@ -266,14 +214,14 @@ static bool ObtainCacheParticleUV(Geometry *geom, return true; } -static bool ObtainCacheParticleVcol(Geometry *geom, +static bool ObtainCacheParticleVcol(Hair *hair, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num) { - if (!(geom && b_mesh && b_ob && CData)) + if (!(hair && b_mesh && b_ob && CData)) return false; CData->curve_vcol.clear(); @@ -314,7 +262,7 @@ static bool ObtainCacheParticleVcol(Geometry *geom, BL::Mesh::vertex_colors_iterator l; b_mesh->vertex_colors.begin(l); - float3 vcol = make_float3(0.0f, 0.0f, 0.0f); + float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f); if (b_mesh->vertex_colors.length()) b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x); CData->curve_vcol.push_back_slow(vcol); @@ -329,272 +277,6 @@ static bool ObtainCacheParticleVcol(Geometry *geom, return true; } -static void ExportCurveTrianglePlanes(Mesh *mesh, - ParticleCurveData *CData, - float3 RotCam, - bool is_ortho) -{ - int vertexno = mesh->verts.size(); - int vertexindex = vertexno; - int numverts = 0, numtris = 0; - - /* compute and reserve size of arrays */ - for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) { - for (int curve = CData->psys_firstcurve[sys]; - curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; - curve++) { - numverts += 2 + (CData->curve_keynum[curve] - 1) * 2; - numtris += (CData->curve_keynum[curve] - 1) * 2; - } - } - - mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris); - - /* actually export */ - for (int sys = 
0; sys < CData->psys_firstcurve.size(); sys++) { - for (int curve = CData->psys_firstcurve[sys]; - curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; - curve++) { - float3 xbasis; - float3 v1; - float time = 0.0f; - float3 ickey_loc = CData->curvekey_co[CData->curve_firstkey[curve]]; - float radius = shaperadius( - CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], 0.0f); - v1 = CData->curvekey_co[CData->curve_firstkey[curve] + 1] - - CData->curvekey_co[CData->curve_firstkey[curve]]; - if (is_ortho) - xbasis = normalize(cross(RotCam, v1)); - else - xbasis = normalize(cross(RotCam - ickey_loc, v1)); - float3 ickey_loc_shfl = ickey_loc - radius * xbasis; - float3 ickey_loc_shfr = ickey_loc + radius * xbasis; - mesh->add_vertex(ickey_loc_shfl); - mesh->add_vertex(ickey_loc_shfr); - vertexindex += 2; - - for (int curvekey = CData->curve_firstkey[curve] + 1; - curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve]; - curvekey++) { - ickey_loc = CData->curvekey_co[curvekey]; - - if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) - v1 = CData->curvekey_co[curvekey] - - CData->curvekey_co[max(curvekey - 1, CData->curve_firstkey[curve])]; - else - v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey - 1]; - - time = CData->curvekey_time[curvekey] / CData->curve_length[curve]; - radius = shaperadius( - CData->psys_shape[sys], CData->psys_rootradius[sys], CData->psys_tipradius[sys], time); - - if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) - radius = shaperadius(CData->psys_shape[sys], - CData->psys_rootradius[sys], - CData->psys_tipradius[sys], - 0.95f); - - if (CData->psys_closetip[sys] && - (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)) - radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f); - - if (is_ortho) - xbasis = normalize(cross(RotCam, v1)); - else - xbasis = 
normalize(cross(RotCam - ickey_loc, v1)); - float3 ickey_loc_shfl = ickey_loc - radius * xbasis; - float3 ickey_loc_shfr = ickey_loc + radius * xbasis; - mesh->add_vertex(ickey_loc_shfl); - mesh->add_vertex(ickey_loc_shfr); - mesh->add_triangle( - vertexindex - 2, vertexindex, vertexindex - 1, CData->psys_shader[sys], true); - mesh->add_triangle( - vertexindex + 1, vertexindex - 1, vertexindex, CData->psys_shader[sys], true); - vertexindex += 2; - } - } - } - - mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); - mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); - mesh->attributes.remove(ATTR_STD_FACE_NORMAL); - mesh->add_face_normals(); - mesh->add_vertex_normals(); - mesh->attributes.remove(ATTR_STD_FACE_NORMAL); - - /* texture coords still needed */ -} - -static void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution) -{ - int vertexno = mesh->verts.size(); - int vertexindex = vertexno; - int numverts = 0, numtris = 0; - - /* compute and reserve size of arrays */ - for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) { - for (int curve = CData->psys_firstcurve[sys]; - curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; - curve++) { - numverts += (CData->curve_keynum[curve] - 1) * resolution + resolution; - numtris += (CData->curve_keynum[curve] - 1) * 2 * resolution; - } - } - - mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris); - - /* actually export */ - for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) { - for (int curve = CData->psys_firstcurve[sys]; - curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; - curve++) { - float3 firstxbasis = cross(make_float3(1.0f, 0.0f, 0.0f), - CData->curvekey_co[CData->curve_firstkey[curve] + 1] - - CData->curvekey_co[CData->curve_firstkey[curve]]); - if (!is_zero(firstxbasis)) - firstxbasis = normalize(firstxbasis); - else - firstxbasis = normalize(cross(make_float3(0.0f, 1.0f, 0.0f), - 
CData->curvekey_co[CData->curve_firstkey[curve] + 1] - - CData->curvekey_co[CData->curve_firstkey[curve]])); - - for (int curvekey = CData->curve_firstkey[curve]; - curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; - curvekey++) { - float3 xbasis = firstxbasis; - float3 v1; - float3 v2; - - if (curvekey == CData->curve_firstkey[curve]) { - v1 = CData->curvekey_co[min( - curvekey + 2, CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)] - - CData->curvekey_co[curvekey + 1]; - v2 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey]; - } - else if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) { - v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1]; - v2 = CData->curvekey_co[curvekey - 1] - - CData->curvekey_co[max(curvekey - 2, CData->curve_firstkey[curve])]; - } - else { - v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey]; - v2 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1]; - } - - xbasis = cross(v1, v2); - - if (len_squared(xbasis) >= 0.05f * len_squared(v1) * len_squared(v2)) { - firstxbasis = normalize(xbasis); - break; - } - } - - for (int curvekey = CData->curve_firstkey[curve]; - curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; - curvekey++) { - int subv = 1; - float3 xbasis; - float3 ybasis; - float3 v1; - float3 v2; - - if (curvekey == CData->curve_firstkey[curve]) { - subv = 0; - v1 = CData->curvekey_co[min( - curvekey + 2, CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)] - - CData->curvekey_co[curvekey + 1]; - v2 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey]; - } - else if (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1) { - v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1]; - v2 = CData->curvekey_co[curvekey - 1] - - CData->curvekey_co[max(curvekey - 2, CData->curve_firstkey[curve])]; - } - else { - v1 = 
CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey]; - v2 = CData->curvekey_co[curvekey] - CData->curvekey_co[curvekey - 1]; - } - - xbasis = cross(v1, v2); - - if (len_squared(xbasis) >= 0.05f * len_squared(v1) * len_squared(v2)) { - xbasis = normalize(xbasis); - firstxbasis = xbasis; - } - else - xbasis = firstxbasis; - - ybasis = normalize(cross(xbasis, v2)); - - for (; subv <= 1; subv++) { - float3 ickey_loc = make_float3(0.0f, 0.0f, 0.0f); - float time = 0.0f; - - InterpolateKeySegments(subv, 1, curvekey, curve, &ickey_loc, &time, CData); - - float radius = shaperadius(CData->psys_shape[sys], - CData->psys_rootradius[sys], - CData->psys_tipradius[sys], - time); - - if ((curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2) && - (subv == 1)) - radius = shaperadius(CData->psys_shape[sys], - CData->psys_rootradius[sys], - CData->psys_tipradius[sys], - 0.95f); - - if (CData->psys_closetip[sys] && (subv == 1) && - (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 2)) - radius = shaperadius(CData->psys_shape[sys], CData->psys_rootradius[sys], 0.0f, 0.95f); - - float angle = M_2PI_F / (float)resolution; - for (int section = 0; section < resolution; section++) { - float3 ickey_loc_shf = ickey_loc + radius * (cosf(angle * section) * xbasis + - sinf(angle * section) * ybasis); - mesh->add_vertex(ickey_loc_shf); - } - - if (subv != 0) { - for (int section = 0; section < resolution - 1; section++) { - mesh->add_triangle(vertexindex - resolution + section, - vertexindex + section, - vertexindex - resolution + section + 1, - CData->psys_shader[sys], - true); - mesh->add_triangle(vertexindex + section + 1, - vertexindex - resolution + section + 1, - vertexindex + section, - CData->psys_shader[sys], - true); - } - mesh->add_triangle(vertexindex - 1, - vertexindex + resolution - 1, - vertexindex - resolution, - CData->psys_shader[sys], - true); - mesh->add_triangle(vertexindex, - vertexindex - resolution, - vertexindex + 
resolution - 1, - CData->psys_shader[sys], - true); - } - vertexindex += resolution; - } - } - } - } - - mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); - mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); - mesh->attributes.remove(ATTR_STD_FACE_NORMAL); - mesh->add_face_normals(); - mesh->add_vertex_normals(); - mesh->attributes.remove(ATTR_STD_FACE_NORMAL); - - /* texture coords still needed */ -} - static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CData) { int num_keys = 0; @@ -823,154 +505,8 @@ static void ExportCurveSegmentsMotion(Hair *hair, ParticleCurveData *CData, int } } -static void ExportCurveTriangleUV(ParticleCurveData *CData, int resol, float2 *uvdata) -{ - if (uvdata == NULL) - return; - int vertexindex = 0; - - for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) { - for (int curve = CData->psys_firstcurve[sys]; - curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; - curve++) { - for (int curvekey = CData->curve_firstkey[curve]; - curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; - curvekey++) { - for (int section = 0; section < resol; section++) { - uvdata[vertexindex] = CData->curve_uv[curve]; - vertexindex++; - uvdata[vertexindex] = CData->curve_uv[curve]; - vertexindex++; - uvdata[vertexindex] = CData->curve_uv[curve]; - vertexindex++; - uvdata[vertexindex] = CData->curve_uv[curve]; - vertexindex++; - uvdata[vertexindex] = CData->curve_uv[curve]; - vertexindex++; - uvdata[vertexindex] = CData->curve_uv[curve]; - vertexindex++; - } - } - } - } -} - -static void ExportCurveTriangleVcol(ParticleCurveData *CData, int resol, uchar4 *cdata) -{ - if (cdata == NULL) - return; - - int vertexindex = 0; - - for (int sys = 0; sys < CData->psys_firstcurve.size(); sys++) { - for (int curve = CData->psys_firstcurve[sys]; - curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; - curve++) { - for (int curvekey = CData->curve_firstkey[curve]; - curvekey < 
CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; - curvekey++) { - for (int section = 0; section < resol; section++) { - /* Encode vertex color using the sRGB curve. */ - cdata[vertexindex] = color_float_to_byte( - color_srgb_to_linear_v3(CData->curve_vcol[curve])); - vertexindex++; - cdata[vertexindex] = color_float_to_byte( - color_srgb_to_linear_v3(CData->curve_vcol[curve])); - vertexindex++; - cdata[vertexindex] = color_float_to_byte( - color_srgb_to_linear_v3(CData->curve_vcol[curve])); - vertexindex++; - cdata[vertexindex] = color_float_to_byte( - color_srgb_to_linear_v3(CData->curve_vcol[curve])); - vertexindex++; - cdata[vertexindex] = color_float_to_byte( - color_srgb_to_linear_v3(CData->curve_vcol[curve])); - vertexindex++; - cdata[vertexindex] = color_float_to_byte( - color_srgb_to_linear_v3(CData->curve_vcol[curve])); - vertexindex++; - } - } - } - } -} - /* Hair Curve Sync */ -void BlenderSync::sync_curve_settings(BL::Depsgraph &b_depsgraph) -{ - PointerRNA csscene = RNA_pointer_get(&b_scene.ptr, "cycles_curves"); - - CurveSystemManager *curve_system_manager = scene->curve_system_manager; - CurveSystemManager prev_curve_system_manager = *curve_system_manager; - - curve_system_manager->use_curves = get_boolean(csscene, "use_curves"); - - curve_system_manager->primitive = (CurvePrimitiveType)get_enum( - csscene, "primitive", CURVE_NUM_PRIMITIVE_TYPES, CURVE_LINE_SEGMENTS); - curve_system_manager->curve_shape = (CurveShapeType)get_enum( - csscene, "shape", CURVE_NUM_SHAPE_TYPES, CURVE_THICK); - curve_system_manager->resolution = get_int(csscene, "resolution"); - curve_system_manager->subdivisions = get_int(csscene, "subdivisions"); - curve_system_manager->use_backfacing = !get_boolean(csscene, "cull_backfacing"); - - /* Triangles */ - if (curve_system_manager->primitive == CURVE_TRIANGLES) { - /* camera facing planes */ - if (curve_system_manager->curve_shape == CURVE_RIBBON) { - curve_system_manager->triangle_method = 
CURVE_CAMERA_TRIANGLES; - curve_system_manager->resolution = 1; - } - else if (curve_system_manager->curve_shape == CURVE_THICK) { - curve_system_manager->triangle_method = CURVE_TESSELATED_TRIANGLES; - } - } - /* Line Segments */ - else if (curve_system_manager->primitive == CURVE_LINE_SEGMENTS) { - if (curve_system_manager->curve_shape == CURVE_RIBBON) { - /* tangent shading */ - curve_system_manager->line_method = CURVE_UNCORRECTED; - curve_system_manager->use_encasing = true; - curve_system_manager->use_backfacing = false; - curve_system_manager->use_tangent_normal_geometry = true; - } - else if (curve_system_manager->curve_shape == CURVE_THICK) { - curve_system_manager->line_method = CURVE_ACCURATE; - curve_system_manager->use_encasing = false; - curve_system_manager->use_tangent_normal_geometry = false; - } - } - /* Curve Segments */ - else if (curve_system_manager->primitive == CURVE_SEGMENTS) { - if (curve_system_manager->curve_shape == CURVE_RIBBON) { - curve_system_manager->primitive = CURVE_RIBBONS; - curve_system_manager->use_backfacing = false; - } - } - - if (curve_system_manager->modified_mesh(prev_curve_system_manager)) { - BL::Depsgraph::objects_iterator b_ob; - - for (b_depsgraph.objects.begin(b_ob); b_ob != b_data.objects.end(); ++b_ob) { - if (object_is_mesh(*b_ob)) { - BL::Object::particle_systems_iterator b_psys; - for (b_ob->particle_systems.begin(b_psys); b_psys != b_ob->particle_systems.end(); - ++b_psys) { - if ((b_psys->settings().render_type() == BL::ParticleSettings::render_type_PATH) && - (b_psys->settings().type() == BL::ParticleSettings::type_HAIR)) { - BL::ID key = BKE_object_is_modified(*b_ob) ? *b_ob : b_ob->data(); - geometry_map.set_recalc(key); - object_map.set_recalc(*b_ob); - } - } - } - } - } - - if (curve_system_manager->modified(prev_curve_system_manager)) - curve_system_manager->tag_update(scene); -} - bool BlenderSync::object_has_particle_hair(BL::Object b_ob) { /* Test if the object has a particle modifier with hair. 
*/ @@ -994,78 +530,38 @@ bool BlenderSync::object_has_particle_hair(BL::Object b_ob) /* Old particle hair. */ void BlenderSync::sync_particle_hair( - Geometry *geom, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step) + Hair *hair, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step) { - Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL; - Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL; - /* obtain general settings */ if (b_ob.mode() == b_ob.mode_PARTICLE_EDIT || b_ob.mode() == b_ob.mode_EDIT) { return; } - const int triangle_method = scene->curve_system_manager->triangle_method; - const int resolution = scene->curve_system_manager->resolution; - int used_res = 1; - /* extract particle hair data - should be combined with connecting to mesh later*/ ParticleCurveData CData; - ObtainCacheParticleData(geom, &b_mesh, &b_ob, &CData, !preview); - - /* add hair geometry to mesh */ - if (mesh) { - if (triangle_method == CURVE_CAMERA_TRIANGLES) { - /* obtain camera parameters */ - float3 RotCam; - Camera *camera = scene->camera; - Transform &ctfm = camera->matrix; - if (camera->type == CAMERA_ORTHOGRAPHIC) { - RotCam = -make_float3(ctfm.x.z, ctfm.y.z, ctfm.z.z); - } - else { - Transform tfm = get_transform(b_ob.matrix_world()); - Transform itfm = transform_quick_inverse(tfm); - RotCam = transform_point(&itfm, make_float3(ctfm.x.w, ctfm.y.w, ctfm.z.w)); - } - bool is_ortho = camera->type == CAMERA_ORTHOGRAPHIC; - ExportCurveTrianglePlanes(mesh, &CData, RotCam, is_ortho); - } - else { - ExportCurveTriangleGeometry(mesh, &CData, resolution); - used_res = resolution; - } - } - else { - if (motion) - ExportCurveSegmentsMotion(hair, &CData, motion_step); - else - ExportCurveSegments(scene, hair, &CData); - } + ObtainCacheParticleData(hair, &b_mesh, &b_ob, &CData, !preview); + + /* add hair geometry */ + if (motion) + ExportCurveSegmentsMotion(hair, &CData, motion_step); + else + 
ExportCurveSegments(scene, hair, &CData); /* generated coordinates from first key. we should ideally get this from * blender to handle deforming objects */ if (!motion) { - if (geom->need_attribute(scene, ATTR_STD_GENERATED)) { + if (hair->need_attribute(scene, ATTR_STD_GENERATED)) { float3 loc, size; mesh_texture_space(b_mesh, loc, size); - if (mesh) { - Attribute *attr_generated = mesh->attributes.add(ATTR_STD_GENERATED); - float3 *generated = attr_generated->data_float3(); - - for (size_t i = 0; i < mesh->verts.size(); i++) - generated[i] = mesh->verts[i] * size - loc; - } - else { - Attribute *attr_generated = hair->attributes.add(ATTR_STD_GENERATED); - float3 *generated = attr_generated->data_float3(); + Attribute *attr_generated = hair->attributes.add(ATTR_STD_GENERATED); + float3 *generated = attr_generated->data_float3(); - for (size_t i = 0; i < hair->num_curves(); i++) { - float3 co = hair->curve_keys[hair->get_curve(i).first_key]; - generated[i] = co * size - loc; - } + for (size_t i = 0; i < hair->num_curves(); i++) { + float3 co = hair->curve_keys[hair->get_curve(i).first_key]; + generated[i] = co * size - loc; } } } @@ -1076,32 +572,22 @@ void BlenderSync::sync_particle_hair( int vcol_num = 0; for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l, vcol_num++) { - if (!geom->need_attribute(scene, ustring(l->name().c_str()))) + if (!hair->need_attribute(scene, ustring(l->name().c_str()))) continue; - ObtainCacheParticleVcol(geom, &b_mesh, &b_ob, &CData, !preview, vcol_num); + ObtainCacheParticleVcol(hair, &b_mesh, &b_ob, &CData, !preview, vcol_num); - if (mesh) { - Attribute *attr_vcol = mesh->attributes.add( - ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CORNER_BYTE); + Attribute *attr_vcol = hair->attributes.add( + ustring(l->name().c_str()), TypeRGBA, ATTR_ELEMENT_CURVE); - uchar4 *cdata = attr_vcol->data_uchar4(); + float4 *fdata = attr_vcol->data_float4(); - ExportCurveTriangleVcol(&CData, used_res, cdata); - } - 
else { - Attribute *attr_vcol = hair->attributes.add( - ustring(l->name().c_str()), TypeDesc::TypeColor, ATTR_ELEMENT_CURVE); - - float3 *fdata = attr_vcol->data_float3(); + if (fdata) { + size_t i = 0; - if (fdata) { - size_t i = 0; - - /* Encode vertex color using the sRGB curve. */ - for (size_t curve = 0; curve < CData.curve_vcol.size(); curve++) { - fdata[i++] = color_srgb_to_linear_v3(CData.curve_vcol[curve]); - } + /* Encode vertex color using the sRGB curve. */ + for (size_t curve = 0; curve < CData.curve_vcol.size(); curve++) { + fdata[i++] = color_srgb_to_linear_v4(CData.curve_vcol[curve]); } } } @@ -1118,35 +604,23 @@ void BlenderSync::sync_particle_hair( ustring name = ustring(l->name().c_str()); /* UV map */ - if (geom->need_attribute(scene, name) || geom->need_attribute(scene, std)) { + if (hair->need_attribute(scene, name) || hair->need_attribute(scene, std)) { Attribute *attr_uv; - ObtainCacheParticleUV(geom, &b_mesh, &b_ob, &CData, !preview, uv_num); + ObtainCacheParticleUV(hair, &b_mesh, &b_ob, &CData, !preview, uv_num); - if (mesh) { - if (active_render) - attr_uv = mesh->attributes.add(std, name); - else - attr_uv = mesh->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CORNER); - - float2 *uv = attr_uv->data_float2(); - - ExportCurveTriangleUV(&CData, used_res, uv); - } - else { - if (active_render) - attr_uv = hair->attributes.add(std, name); - else - attr_uv = hair->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE); + if (active_render) + attr_uv = hair->attributes.add(std, name); + else + attr_uv = hair->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CURVE); - float2 *uv = attr_uv->data_float2(); + float2 *uv = attr_uv->data_float2(); - if (uv) { - size_t i = 0; + if (uv) { + size_t i = 0; - for (size_t curve = 0; curve < CData.curve_uv.size(); curve++) { - uv[i++] = CData.curve_uv[curve]; - } + for (size_t curve = 0; curve < CData.curve_uv.size(); curve++) { + uv[i++] = CData.curve_uv[curve]; } } } @@ -1154,7 +628,6 @@ void 
BlenderSync::sync_particle_hair( } } -#ifdef WITH_NEW_OBJECT_TYPES static float4 hair_point_as_float4(BL::HairPoint b_point) { float4 mP = float3_to_float4(get_float3(b_point.co())); @@ -1320,12 +793,10 @@ static void export_hair_curves_motion(Hair *hair, BL::Hair b_hair, int motion_st export_hair_motion_validate_attribute(hair, motion_step, num_motion_keys, have_motion); } } -#endif /* WITH_NEW_OBJECT_TYPES */ /* Hair object. */ void BlenderSync::sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step) { -#ifdef WITH_NEW_OBJECT_TYPES /* Convert Blender hair to Cycles curves. */ BL::Hair b_hair(b_ob.data()); if (motion) { @@ -1334,97 +805,70 @@ void BlenderSync::sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motio else { export_hair_curves(scene, hair, b_hair); } -#else - (void)hair; - (void)b_ob; - (void)motion; - (void)motion_step; -#endif /* WITH_NEW_OBJECT_TYPES */ } void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BL::Object b_ob, - Geometry *geom, + Hair *hair, const vector<Shader *> &used_shaders) { - Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL; - Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL; - /* Compares curve_keys rather than strands in order to handle quick hair * adjustments in dynamic BVH - other methods could probably do this better. 
*/ array<float3> oldcurve_keys; array<float> oldcurve_radius; - array<int> oldtriangles; - if (hair) { - oldcurve_keys.steal_data(hair->curve_keys); - oldcurve_radius.steal_data(hair->curve_radius); - } - else { - oldtriangles.steal_data(mesh->triangles); - } + oldcurve_keys.steal_data(hair->curve_keys); + oldcurve_radius.steal_data(hair->curve_radius); - geom->clear(); - geom->used_shaders = used_shaders; + hair->clear(); + hair->used_shaders = used_shaders; - if (view_layer.use_hair && scene->curve_system_manager->use_curves) { -#ifdef WITH_NEW_OBJECT_TYPES + if (view_layer.use_hair) { if (b_ob.type() == BL::Object::type_HAIR) { /* Hair object. */ sync_hair(hair, b_ob, false); - assert(mesh == NULL); } - else -#endif - { + else { /* Particle hair. */ - bool need_undeformed = geom->need_attribute(scene, ATTR_STD_GENERATED); + bool need_undeformed = hair->need_attribute(scene, ATTR_STD_GENERATED); BL::Mesh b_mesh = object_to_mesh( b_data, b_ob, b_depsgraph, need_undeformed, Mesh::SUBDIVISION_NONE); if (b_mesh) { - sync_particle_hair(geom, b_mesh, b_ob, false); + sync_particle_hair(hair, b_mesh, b_ob, false); free_object_to_mesh(b_data, b_ob, b_mesh); } } } /* tag update */ - const bool rebuild = (hair && ((oldcurve_keys != hair->curve_keys) || - (oldcurve_radius != hair->curve_radius))) || - (mesh && (oldtriangles != mesh->triangles)); + const bool rebuild = ((oldcurve_keys != hair->curve_keys) || + (oldcurve_radius != hair->curve_radius)); - geom->tag_update(scene, rebuild); + hair->tag_update(scene, rebuild); } void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph, BL::Object b_ob, - Geometry *geom, + Hair *hair, int motion_step) { - Hair *hair = (geom->type == Geometry::HAIR) ? static_cast<Hair *>(geom) : NULL; - Mesh *mesh = (geom->type == Geometry::MESH) ? static_cast<Mesh *>(geom) : NULL; - /* Skip if nothing exported. 
*/ - if ((hair && hair->num_keys() == 0) || (mesh && mesh->verts.size() == 0)) { + if (hair->num_keys() == 0) { return; } /* Export deformed coordinates. */ if (ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) { -#ifdef WITH_NEW_OBJECT_TYPES if (b_ob.type() == BL::Object::type_HAIR) { /* Hair object. */ sync_hair(hair, b_ob, true, motion_step); - assert(mesh == NULL); return; } - else -#endif - { + else { /* Particle hair. */ BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_depsgraph, false, Mesh::SUBDIVISION_NONE); if (b_mesh) { - sync_particle_hair(geom, b_mesh, b_ob, true, motion_step); + sync_particle_hair(hair, b_mesh, b_ob, true, motion_step); free_object_to_mesh(b_data, b_ob, b_mesh); return; } @@ -1432,12 +876,7 @@ void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph, } /* No deformation on this frame, copy coordinates if other frames did have it. */ - if (hair) { - hair->copy_center_to_motion_step(motion_step); - } - else { - mesh->copy_center_to_motion_step(motion_step); - } + hair->copy_center_to_motion_step(motion_step); } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index ac52948806c..fb9ab9e8c97 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -21,13 +21,6 @@ CCL_NAMESPACE_BEGIN -enum DenoiserType { - DENOISER_NONE = 0, - DENOISER_OPTIX = 1, - - DENOISER_NUM -}; - enum ComputeDevice { COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, @@ -120,49 +113,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen } } - /* Ensure there is an OptiX device when using the OptiX denoiser. 
*/ - bool use_optix_denoising = get_enum(cscene, "preview_denoising", DENOISER_NUM, DENOISER_NONE) == - DENOISER_OPTIX && - !background; - BL::Scene::view_layers_iterator b_view_layer; - for (b_scene.view_layers.begin(b_view_layer); b_view_layer != b_scene.view_layers.end(); - ++b_view_layer) { - PointerRNA crl = RNA_pointer_get(&b_view_layer->ptr, "cycles"); - if (get_boolean(crl, "use_optix_denoising")) { - use_optix_denoising = true; - } - } - - if (use_optix_denoising && device.type != DEVICE_OPTIX) { - vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); - if (!optix_devices.empty()) { - /* Convert to a special multi device with separate denoising devices. */ - if (device.multi_devices.empty()) { - device.multi_devices.push_back(device); - } - - /* Try to use the same physical devices for denoising. */ - for (const DeviceInfo &cuda_device : device.multi_devices) { - if (cuda_device.type == DEVICE_CUDA) { - for (const DeviceInfo &optix_device : optix_devices) { - if (cuda_device.num == optix_device.num) { - device.id += optix_device.id; - device.denoising_devices.push_back(optix_device); - break; - } - } - } - } - - if (device.denoising_devices.empty()) { - /* Simply use the first available OptiX device. */ - const DeviceInfo optix_device = optix_devices.front(); - device.id += optix_device.id; /* Uniquely identify this special multi device. */ - device.denoising_devices.push_back(optix_device); - } - } - } - return device; } diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp index 7ca35cff961..f7e4623024d 100644 --- a/intern/cycles/blender/blender_geometry.cpp +++ b/intern/cycles/blender/blender_geometry.cpp @@ -40,17 +40,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph, BL::Material material_override = view_layer.material_override; Shader *default_shader = (b_ob.type() == BL::Object::type_VOLUME) ? 
scene->default_volume : scene->default_surface; -#ifdef WITH_NEW_OBJECT_TYPES - Geometry::Type geom_type = ((b_ob.type() == BL::Object::type_HAIR || use_particle_hair) && - (scene->curve_system_manager->primitive != CURVE_TRIANGLES)) ? + Geometry::Type geom_type = (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) ? Geometry::HAIR : Geometry::MESH; -#else - Geometry::Type geom_type = ((use_particle_hair) && - (scene->curve_system_manager->primitive != CURVE_TRIANGLES)) ? - Geometry::HAIR : - Geometry::MESH; -#endif /* Find shader indices. */ vector<Shader *> used_shaders; @@ -129,12 +121,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph, geom->name = ustring(b_ob_data.name().c_str()); -#ifdef WITH_NEW_OBJECT_TYPES if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) { -#else - if (use_particle_hair) { -#endif - sync_hair(b_depsgraph, b_ob, geom, used_shaders); + Hair *hair = static_cast<Hair *>(geom); + sync_hair(b_depsgraph, b_ob, hair, used_shaders); } else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) { Mesh *mesh = static_cast<Mesh *>(geom); @@ -173,12 +162,9 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph, return; } -#ifdef WITH_NEW_OBJECT_TYPES if (b_ob.type() == BL::Object::type_HAIR || use_particle_hair) { -#else - if (use_particle_hair) { -#endif - sync_hair_motion(b_depsgraph, b_ob, geom, motion_step); + Hair *hair = static_cast<Hair *>(geom); + sync_hair_motion(b_depsgraph, b_ob, hair, motion_step); } else if (b_ob.type() == BL::Object::type_VOLUME || object_fluid_gas_domain_find(b_ob)) { /* No volume motion blur support yet. 
*/ diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index a6f380a9ae7..49407799fcd 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -278,25 +278,59 @@ static void mikk_compute_tangents( genTangSpaceDefault(&context); } +/* Create sculpt vertex color attributes. */ +static void attr_create_sculpt_vertex_color(Scene *scene, + Mesh *mesh, + BL::Mesh &b_mesh, + bool subdivision) +{ + BL::Mesh::sculpt_vertex_colors_iterator l; + + for (b_mesh.sculpt_vertex_colors.begin(l); l != b_mesh.sculpt_vertex_colors.end(); ++l) { + const bool active_render = l->active_render(); + AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE; + ustring vcol_name = ustring(l->name().c_str()); + + const bool need_vcol = mesh->need_attribute(scene, vcol_name) || + mesh->need_attribute(scene, vcol_std); + + if (!need_vcol) { + continue; + } + + AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes; + Attribute *vcol_attr = attributes.add(vcol_name, TypeRGBA, ATTR_ELEMENT_VERTEX); + vcol_attr->std = vcol_std; + + float4 *cdata = vcol_attr->data_float4(); + int numverts = b_mesh.vertices.length(); + + for (int i = 0; i < numverts; i++) { + *(cdata++) = get_float4(l->data[i].color()); + } + } +} + /* Create vertex color attributes. */ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivision) { - if (subdivision) { - BL::Mesh::vertex_colors_iterator l; + BL::Mesh::vertex_colors_iterator l; - for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) { - const bool active_render = l->active_render(); - AttributeStandard vcol_std = (active_render) ? 
ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE; - ustring vcol_name = ustring(l->name().c_str()); + for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) { + const bool active_render = l->active_render(); + AttributeStandard vcol_std = (active_render) ? ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE; + ustring vcol_name = ustring(l->name().c_str()); - const bool need_vcol = mesh->need_attribute(scene, vcol_name) || - mesh->need_attribute(scene, vcol_std); + const bool need_vcol = mesh->need_attribute(scene, vcol_name) || + mesh->need_attribute(scene, vcol_std); - if (!need_vcol) { - continue; - } + if (!need_vcol) { + continue; + } - Attribute *vcol_attr = NULL; + Attribute *vcol_attr = NULL; + + if (subdivision) { if (active_render) { vcol_attr = mesh->subd_attributes.add(vcol_std, vcol_name); } @@ -316,22 +350,7 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, } } } - } - else { - BL::Mesh::vertex_colors_iterator l; - for (b_mesh.vertex_colors.begin(l); l != b_mesh.vertex_colors.end(); ++l) { - const bool active_render = l->active_render(); - AttributeStandard vcol_std = (active_render) ? 
ATTR_STD_VERTEX_COLOR : ATTR_STD_NONE; - ustring vcol_name = ustring(l->name().c_str()); - - const bool need_vcol = mesh->need_attribute(scene, vcol_name) || - mesh->need_attribute(scene, vcol_std); - - if (!need_vcol) { - continue; - } - - Attribute *vcol_attr = NULL; + else { if (active_render) { vcol_attr = mesh->attributes.add(vcol_std, vcol_name); } @@ -828,6 +847,7 @@ static void create_mesh(Scene *scene, */ attr_create_pointiness(scene, mesh, b_mesh, subdivision); attr_create_vertex_color(scene, mesh, b_mesh, subdivision); + attr_create_sculpt_vertex_color(scene, mesh, b_mesh, subdivision); attr_create_random_per_island(scene, mesh, b_mesh, subdivision); if (subdivision) { diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index c28586d0f63..d3a37563ef4 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -69,11 +69,7 @@ bool BlenderSync::object_is_mesh(BL::Object &b_ob) BL::Object::type_enum type = b_ob.type(); -#ifdef WITH_NEW_OBJECT_TYPES if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) { -#else - if (type == BL::Object::type_VOLUME) { -#endif /* Will be exported attached to mesh. 
*/ return true; } diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 0be19dbffd1..3e595c3ee52 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -31,8 +31,10 @@ #include "util/util_logging.h" #include "util/util_md5.h" #include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" #include "util/util_path.h" #include "util/util_string.h" +#include "util/util_task.h" #include "util/util_types.h" #ifdef WITH_OSL @@ -1075,5 +1077,14 @@ void *CCL_python_module_init() Py_INCREF(Py_False); #endif /* WITH_EMBREE */ + if (ccl::openimagedenoise_supported()) { + PyModule_AddObject(mod, "with_openimagedenoise", Py_True); + Py_INCREF(Py_True); + } + else { + PyModule_AddObject(mod, "with_openimagedenoise", Py_False); + Py_INCREF(Py_False); + } + return (void *)mod; } diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index dbe87ce2b13..391a1b8f473 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -158,7 +158,7 @@ void BlenderSession::create_session() /* set buffer parameters */ BufferParams buffer_params = BlenderSync::get_buffer_params( - b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height); + b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); session->reset(buffer_params, session_params.samples); b_engine.use_highlight_tiles(session_params.progressive_refine == false); @@ -239,8 +239,13 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL); BL::RegionView3D b_null_region_view3d(PointerRNA_NULL); - BufferParams buffer_params = BlenderSync::get_buffer_params( - b_scene, b_render, b_null_space_view3d, b_null_region_view3d, scene->camera, width, height); + BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, + 
b_null_space_view3d, + b_null_region_view3d, + scene->camera, + width, + height, + session_params.denoising.use); session->reset(buffer_params, session_params.samples); b_engine.use_highlight_tiles(session_params.progressive_refine == false); @@ -468,14 +473,13 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) session->update_render_tile_cb = function_bind( &BlenderSession::update_render_tile, this, _1, _2); + BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); + /* get buffer parameters */ SessionParams session_params = BlenderSync::get_session_params( - b_engine, b_userpref, b_scene, background); + b_engine, b_userpref, b_scene, background, b_view_layer); BufferParams buffer_params = BlenderSync::get_buffer_params( - b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height); - - /* render each layer */ - BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval(); + b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); /* temporary render result to find needed passes and views */ BL::RenderResult b_rr = begin_render_result( @@ -485,35 +489,26 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) BL::RenderLayer b_rlay = *b_single_rlay; b_rlay_name = b_view_layer.name(); - /* add passes */ - vector<Pass> passes = sync->sync_render_passes( - b_rlay, b_view_layer, session_params.adaptive_sampling); - buffer_params.passes = passes; + /* Update denoising parameters. 
*/ + session->set_denoising(session_params.denoising); - PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - bool use_denoising = get_boolean(crl, "use_denoising"); - bool use_optix_denoising = get_boolean(crl, "use_optix_denoising"); - bool write_denoising_passes = get_boolean(crl, "denoising_store_passes"); + bool use_denoising = session_params.denoising.use; + bool store_denoising_passes = session_params.denoising.store_passes; - buffer_params.denoising_data_pass = use_denoising || write_denoising_passes; + buffer_params.denoising_data_pass = use_denoising || store_denoising_passes; buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); - buffer_params.denoising_prefiltered_pass = write_denoising_passes && !use_optix_denoising; - - session->params.run_denoising = use_denoising || write_denoising_passes; - session->params.full_denoising = use_denoising && !use_optix_denoising; - session->params.optix_denoising = use_denoising && use_optix_denoising; - session->params.write_denoising_passes = write_denoising_passes && !use_optix_denoising; - session->params.denoising.radius = get_int(crl, "denoising_radius"); - session->params.denoising.strength = get_float(crl, "denoising_strength"); - session->params.denoising.feature_strength = get_float(crl, "denoising_feature_strength"); - session->params.denoising.relative_pca = get_boolean(crl, "denoising_relative_pca"); - session->params.denoising.optix_input_passes = get_enum(crl, "denoising_optix_input_passes"); - session->tile_manager.schedule_denoising = session->params.run_denoising; + buffer_params.denoising_prefiltered_pass = store_denoising_passes && + session_params.denoising.type == DENOISER_NLM; scene->film->denoising_data_pass = buffer_params.denoising_data_pass; scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass; scene->film->denoising_prefiltered_pass = buffer_params.denoising_prefiltered_pass; + /* Add passes */ + vector<Pass> passes 
= sync->sync_render_passes( + b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising); + buffer_params.passes = passes; + scene->film->pass_alpha_threshold = b_view_layer.pass_alpha_threshold(); scene->film->tag_passes_update(scene, passes); scene->film->tag_update(scene); @@ -798,7 +793,7 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) /* increase samples, but never decrease */ session->set_samples(session_params.samples); - session->set_denoising_start_sample(session_params.denoising_start_sample); + session->set_denoising_start_sample(session_params.denoising.start_sample); session->set_pause(session_pause); /* copy recalc flags, outside of mutex so we can decide to do the real @@ -831,21 +826,17 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) /* get buffer parameters */ BufferParams buffer_params = BlenderSync::get_buffer_params( - b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height); + b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); - if (session_params.device.type != DEVICE_OPTIX && - session_params.device.denoising_devices.empty()) { - /* cannot use OptiX denoising when it is not supported by the device. */ - buffer_params.denoising_data_pass = false; - } - else { - session->set_denoising(buffer_params.denoising_data_pass, true); + if (!buffer_params.denoising_data_pass) { + session_params.denoising.use = false; } + session->set_denoising(session_params.denoising); + + /* Update film if denoising data was enabled or disabled. */ if (scene->film->denoising_data_pass != buffer_params.denoising_data_pass) { scene->film->denoising_data_pass = buffer_params.denoising_data_pass; - - /* Force a scene and session reset below. 
*/ scene->film->tag_update(scene); } @@ -917,7 +908,7 @@ bool BlenderSession::draw(int w, int h) SessionParams session_params = BlenderSync::get_session_params( b_engine, b_userpref, b_scene, background); BufferParams buffer_params = BlenderSync::get_buffer_params( - b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height); + b_render, b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use); bool session_pause = BlenderSync::get_session_pause(b_scene, background); if (session_pause == false) { @@ -935,7 +926,7 @@ bool BlenderSession::draw(int w, int h) /* draw */ BufferParams buffer_params = BlenderSync::get_buffer_params( - b_scene, b_render, b_v3d, b_rv3d, scene->camera, width, height); + b_render, b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use); DeviceDrawParams draw_params; if (session->params.display_buffer_linear) { diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index f207d8ae07f..19d2730dc93 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -813,6 +813,14 @@ static ShaderNode *add_node(Scene *scene, sky->sun_direction = normalize(get_float3(b_sky_node.sun_direction())); sky->turbidity = b_sky_node.turbidity(); sky->ground_albedo = b_sky_node.ground_albedo(); + sky->sun_disc = b_sky_node.sun_disc(); + sky->sun_size = b_sky_node.sun_size(); + sky->sun_elevation = b_sky_node.sun_elevation(); + sky->sun_rotation = b_sky_node.sun_rotation(); + sky->altitude = b_sky_node.altitude(); + sky->air_density = b_sky_node.air_density(); + sky->dust_density = b_sky_node.dust_density(); + sky->ozone_density = b_sky_node.ozone_density(); BL::TexMapping b_texture_mapping(b_sky_node.texture_mapping()); get_tex_mapping(&sky->tex_mapping, b_texture_mapping); node = sky; diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 09813dc8c05..bf065cc5492 100644 --- 
a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -38,6 +38,7 @@ #include "util/util_foreach.h" #include "util/util_hash.h" #include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" CCL_NAMESPACE_BEGIN @@ -212,7 +213,6 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render, sync_film(b_v3d); sync_shaders(b_depsgraph, b_v3d); sync_images(); - sync_curve_settings(b_depsgraph); geometry_synced.clear(); /* use for objects and motion sync */ @@ -538,7 +538,8 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass) vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer, - bool adaptive_sampling) + bool adaptive_sampling, + const DenoiseParams &denoising) { vector<Pass> passes; @@ -555,16 +556,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, Pass::add(pass_type, passes, b_pass.name().c_str()); } - PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles"); - bool use_denoising = get_boolean(crp, "use_denoising"); - bool use_optix_denoising = get_boolean(crp, "use_optix_denoising"); - bool write_denoising_passes = get_boolean(crp, "denoising_store_passes"); + PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); scene->film->denoising_flags = 0; - if (use_denoising || write_denoising_passes) { - if (!use_optix_denoising) { + if (denoising.use || denoising.store_passes) { + if (denoising.type == DENOISER_NLM) { #define MAP_OPTION(name, flag) \ - if (!get_boolean(crp, name)) \ + if (!get_boolean(crl, name)) \ scene->film->denoising_flags |= flag; MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); @@ -577,11 +575,11 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str()); } - if (write_denoising_passes) { + if (denoising.store_passes) { 
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str()); - if (!use_optix_denoising) { + if (denoising.type == DENOISER_NLM) { b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str()); b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str()); @@ -593,46 +591,46 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, } #ifdef __KERNEL_DEBUG__ - if (get_boolean(crp, "pass_debug_bvh_traversed_nodes")) { + if (get_boolean(crl, "pass_debug_bvh_traversed_nodes")) { b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_BVH_TRAVERSED_NODES, passes, "Debug BVH Traversed Nodes"); } - if (get_boolean(crp, "pass_debug_bvh_traversed_instances")) { + if (get_boolean(crl, "pass_debug_bvh_traversed_instances")) { b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes, "Debug BVH Traversed Instances"); } - if (get_boolean(crp, "pass_debug_bvh_intersections")) { + if (get_boolean(crl, "pass_debug_bvh_intersections")) { b_engine.add_pass("Debug BVH Intersections", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_BVH_INTERSECTIONS, passes, "Debug BVH Intersections"); } - if (get_boolean(crp, "pass_debug_ray_bounces")) { + if (get_boolean(crl, "pass_debug_ray_bounces")) { b_engine.add_pass("Debug Ray Bounces", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_RAY_BOUNCES, passes, "Debug Ray Bounces"); } #endif - if (get_boolean(crp, "pass_debug_render_time")) { + if (get_boolean(crl, "pass_debug_render_time")) { b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time"); } 
- if (get_boolean(crp, "pass_debug_sample_count")) { + if (get_boolean(crl, "pass_debug_sample_count")) { b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count"); } - if (get_boolean(crp, "use_pass_volume_direct")) { + if (get_boolean(crl, "use_pass_volume_direct")) { b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir"); } - if (get_boolean(crp, "use_pass_volume_indirect")) { + if (get_boolean(crl, "use_pass_volume_indirect")) { b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str()); Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd"); } /* Cryptomatte stores two ID/weight pairs per RGBA layer. * User facing parameter is the number of pairs. */ - int crypto_depth = divide_up(min(16, get_int(crp, "pass_crypto_depth")), 2); + int crypto_depth = divide_up(min(16, get_int(crl, "pass_crypto_depth")), 2); scene->film->cryptomatte_depth = crypto_depth; scene->film->cryptomatte_passes = CRYPT_NONE; - if (get_boolean(crp, "use_pass_crypto_object")) { + if (get_boolean(crl, "use_pass_crypto_object")) { for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Object%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); @@ -641,7 +639,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_OBJECT); } - if (get_boolean(crp, "use_pass_crypto_material")) { + if (get_boolean(crl, "use_pass_crypto_material")) { for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Material%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); @@ -650,7 +648,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, scene->film->cryptomatte_passes = 
(CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_MATERIAL); } - if (get_boolean(crp, "use_pass_crypto_asset")) { + if (get_boolean(crl, "use_pass_crypto_asset")) { for (int i = 0; i < crypto_depth; i++) { string passname = cryptomatte_prefix + string_printf("Asset%02d", i); b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str()); @@ -659,19 +657,19 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_ASSET); } - if (get_boolean(crp, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) { + if (get_boolean(crl, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) { scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_ACCURATE); } if (adaptive_sampling) { Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes); - if (!get_boolean(crp, "pass_debug_sample_count")) { + if (!get_boolean(crl, "pass_debug_sample_count")) { Pass::add(PASS_SAMPLE_COUNT, passes); } } - RNA_BEGIN (&crp, b_aov, "aovs") { + RNA_BEGIN (&crl, b_aov, "aovs") { bool is_color = (get_enum(b_aov, "type") == 1); string name = get_string(b_aov, "name"); @@ -732,6 +730,11 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background) params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh"); params.num_bvh_time_steps = RNA_int_get(&cscene, "debug_bvh_time_steps"); + PointerRNA csscene = RNA_pointer_get(&b_scene.ptr, "cycles_curves"); + params.hair_subdivisions = get_int(csscene, "subdivisions"); + params.hair_shape = (CurveShapeType)get_enum( + csscene, "shape", CURVE_NUM_SHAPE_TYPES, CURVE_THICK); + if (background && params.shadingsystem != SHADINGSYSTEM_OSL) params.persistent_data = r.use_persistent_data(); else @@ -751,20 +754,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background) params.texture_limit = 0; } - /* TODO(sergey): 
Once OSL supports per-microarchitecture optimization get - * rid of this. - */ - if (params.shadingsystem == SHADINGSYSTEM_OSL) { - params.bvh_layout = BVH_LAYOUT_BVH4; - } - else { - params.bvh_layout = DebugFlags().cpu.bvh_layout; - } - -#ifdef WITH_EMBREE - params.bvh_layout = RNA_boolean_get(&cscene, "use_bvh_embree") ? BVH_LAYOUT_EMBREE : - params.bvh_layout; -#endif + params.bvh_layout = DebugFlags().cpu.bvh_layout; params.background = background; @@ -782,7 +772,8 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background) SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, BL::Preferences &b_preferences, BL::Scene &b_scene, - bool background) + bool background, + BL::ViewLayer b_view_layer) { SessionParams params; PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); @@ -860,9 +851,22 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, params.tile_order = TILE_BOTTOM_TO_TOP; } - /* other parameters */ + /* Denoising */ + params.denoising = get_denoise_params(b_scene, b_view_layer, background); + + if (params.denoising.use) { + /* Add additional denoising devices if we are rendering and denoising + * with different devices. */ + params.device.add_denoising_devices(params.denoising.type); + + /* Check if denoiser is supported by device. 
*/ + if (!(params.device.denoisers & params.denoising.type)) { + params.denoising.use = false; + } + } + + /* Viewport Performance */ params.start_resolution = get_int(cscene, "preview_start_resolution"); - params.denoising_start_sample = get_int(cscene, "preview_denoising_start_sample"); params.pixel_size = b_engine.get_preview_pixel_size(b_scene); /* other parameters */ @@ -915,4 +919,55 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, return params; } +DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene, + BL::ViewLayer &b_view_layer, + bool background) +{ + DenoiseParams denoising; + PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + + if (background) { + /* Final Render Denoising */ + denoising.use = get_boolean(cscene, "use_denoising"); + denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE); + + if (b_view_layer) { + PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles"); + if (!get_boolean(clayer, "use_denoising")) { + denoising.use = false; + } + + denoising.radius = get_int(clayer, "denoising_radius"); + denoising.strength = get_float(clayer, "denoising_strength"); + denoising.feature_strength = get_float(clayer, "denoising_feature_strength"); + denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca"); + denoising.optix_input_passes = get_enum(clayer, "denoising_optix_input_passes"); + + denoising.store_passes = get_boolean(clayer, "denoising_store_passes"); + } + } + else { + /* Viewport Denoising */ + denoising.use = get_boolean(cscene, "use_preview_denoising"); + denoising.type = (DenoiserType)get_enum( + cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE); + denoising.start_sample = get_int(cscene, "preview_denoising_start_sample"); + + /* Auto select fastest denoiser. 
*/ + if (denoising.type == DENOISER_NONE) { + if (!Device::available_devices(DEVICE_MASK_OPTIX).empty()) { + denoising.type = DENOISER_OPTIX; + } + else if (openimagedenoise_supported()) { + denoising.type = DENOISER_OPENIMAGEDENOISE; + } + else { + denoising.use = false; + } + } + } + + return denoising; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 341281b18ee..0214d9eb3b8 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -75,7 +75,8 @@ class BlenderSync { void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer); vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer, - bool adaptive_sampling); + bool adaptive_sampling, + const DenoiseParams &denoising); void sync_integrator(); void sync_camera(BL::RenderSettings &b_render, BL::Object &b_override, @@ -94,23 +95,29 @@ class BlenderSync { /* get parameters */ static SceneParams get_scene_params(BL::Scene &b_scene, bool background); - static SessionParams get_session_params(BL::RenderEngine &b_engine, - BL::Preferences &b_userpref, - BL::Scene &b_scene, - bool background); + static SessionParams get_session_params( + BL::RenderEngine &b_engine, + BL::Preferences &b_userpref, + BL::Scene &b_scene, + bool background, + BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL)); static bool get_session_pause(BL::Scene &b_scene, bool background); - static BufferParams get_buffer_params(BL::Scene &b_scene, - BL::RenderSettings &b_render, + static BufferParams get_buffer_params(BL::RenderSettings &b_render, BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, - int height); + int height, + const bool use_denoiser); static PassType get_pass_type(BL::RenderPass &b_pass); static int get_denoising_pass(BL::RenderPass &b_pass); private: + static DenoiseParams get_denoise_params(BL::Scene &b_scene, + BL::ViewLayer &b_view_layer, + bool 
background); + /* sync */ void sync_lights(BL::Depsgraph &b_depsgraph, bool update_all); void sync_materials(BL::Depsgraph &b_depsgraph, bool update_all); @@ -153,16 +160,12 @@ class BlenderSync { /* Hair */ void sync_hair(BL::Depsgraph b_depsgraph, BL::Object b_ob, - Geometry *geom, + Hair *hair, const vector<Shader *> &used_shaders); - void sync_hair_motion(BL::Depsgraph b_depsgraph, - BL::Object b_ob, - Geometry *geom, - int motion_step); + void sync_hair_motion(BL::Depsgraph b_depsgraph, BL::Object b_ob, Hair *hair, int motion_step); void sync_hair(Hair *hair, BL::Object &b_ob, bool motion, int motion_step = 0); void sync_particle_hair( - Geometry *geom, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0); - void sync_curve_settings(BL::Depsgraph &b_depsgraph); + Hair *hair, BL::Mesh &b_mesh, BL::Object &b_ob, bool motion, int motion_step = 0); bool object_has_particle_hair(BL::Object b_ob); /* Camera */ diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp index 93e84e28032..73ef5f94720 100644 --- a/intern/cycles/blender/blender_viewport.cpp +++ b/intern/cycles/blender/blender_viewport.cpp @@ -61,17 +61,6 @@ const bool BlenderViewportParameters::custom_viewport_parameters() const return !(use_scene_world && use_scene_lights); } -bool BlenderViewportParameters::get_viewport_display_denoising(BL::SpaceView3D &b_v3d, - BL::Scene &b_scene) -{ - bool use_denoising = false; - if (b_v3d) { - PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); - use_denoising = get_enum(cscene, "preview_denoising") != 0; - } - return use_denoising; -} - PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceView3D &b_v3d) { PassType display_pass = PASS_NONE; @@ -83,11 +72,6 @@ PassType BlenderViewportParameters::get_viewport_display_render_pass(BL::SpaceVi return display_pass; } -bool update_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene) -{ - return 
BlenderViewportParameters::get_viewport_display_denoising(b_v3d, b_scene); -} - PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes) { if (b_v3d) { diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h index 3e44e552f1d..7c6c9c4d274 100644 --- a/intern/cycles/blender/blender_viewport.h +++ b/intern/cycles/blender/blender_viewport.h @@ -44,15 +44,11 @@ class BlenderViewportParameters { friend class BlenderSync; public: - /* Get whether to enable denoising data pass in viewport. */ - static bool get_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene); /* Retrieve the render pass that needs to be displayed on the given `SpaceView3D` * When the `b_v3d` parameter is not given `PASS_NONE` will be returned. */ static PassType get_viewport_display_render_pass(BL::SpaceView3D &b_v3d); }; -bool update_viewport_display_denoising(BL::SpaceView3D &b_v3d, BL::Scene &b_scene); - PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes); CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_volume.cpp b/intern/cycles/blender/blender_volume.cpp index 4eed6be8c7c..80591e0eec8 100644 --- a/intern/cycles/blender/blender_volume.cpp +++ b/intern/cycles/blender/blender_volume.cpp @@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN class BlenderSmokeLoader : public ImageLoader { public: BlenderSmokeLoader(BL::Object &b_ob, AttributeStandard attribute) - : b_domain(object_fluid_gas_domain_find(b_ob)), b_mesh(b_ob.data()), attribute(attribute) + : b_domain(object_fluid_gas_domain_find(b_ob)), attribute(attribute) { + BL::Mesh b_mesh(b_ob.data()); + mesh_texture_space(b_mesh, texspace_loc, texspace_size); } bool load_metadata(ImageMetaData &metadata) override @@ -77,9 +79,7 @@ class BlenderSmokeLoader : public ImageLoader { /* Create a matrix to transform from object space to mesh texture space. 
* This does not work with deformations but that can probably only be done * well with a volume grid mapping of coordinates. */ - float3 loc, size; - mesh_texture_space(b_mesh, loc, size); - metadata.transform_3d = transform_translate(-loc) * transform_scale(size); + metadata.transform_3d = transform_translate(-texspace_loc) * transform_scale(texspace_size); metadata.use_transform_3d = true; return true; @@ -177,7 +177,7 @@ class BlenderSmokeLoader : public ImageLoader { } BL::FluidDomainSettings b_domain; - BL::Mesh b_mesh; + float3 texspace_loc, texspace_size; AttributeStandard attribute; }; @@ -216,25 +216,16 @@ static void sync_smoke_volume(Scene *scene, BL::Object &b_ob, Mesh *mesh, float class BlenderVolumeLoader : public VDBImageLoader { public: - BlenderVolumeLoader(BL::Volume b_volume, const string &grid_name) - : VDBImageLoader(grid_name), - b_volume(b_volume), - b_volume_grid(PointerRNA_NULL), - unload(false) + BlenderVolumeLoader(BL::BlendData &b_data, BL::Volume &b_volume, const string &grid_name) + : VDBImageLoader(grid_name), b_data(b_data), b_volume(b_volume), unload(false) { -#ifdef WITH_OPENVDB - /* Find grid with matching name. */ - BL::Volume::grids_iterator b_grid_iter; - for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) { - if (b_grid_iter->name() == grid_name) { - b_volume_grid = *b_grid_iter; - } - } -#endif } bool load_metadata(ImageMetaData &metadata) override { + b_volume.grids.load(b_data.ptr.data); + BL::VolumeGrid b_volume_grid = find_grid(); + if (!b_volume_grid) { return false; } @@ -255,6 +246,9 @@ class BlenderVolumeLoader : public VDBImageLoader { const size_t pixel_size, const bool associate_alpha) override { + b_volume.grids.load(b_data.ptr.data); + BL::VolumeGrid b_volume_grid = find_grid(); + if (!b_volume_grid) { return false; } @@ -266,19 +260,38 @@ class BlenderVolumeLoader : public VDBImageLoader { { /* TODO: detect multiple volume datablocks with the same filepath. 
*/ const BlenderVolumeLoader &other_loader = (const BlenderVolumeLoader &)other; - return b_volume == other_loader.b_volume && b_volume_grid == other_loader.b_volume_grid; + return b_volume == other_loader.b_volume && grid_name == other_loader.grid_name; } void cleanup() override { VDBImageLoader::cleanup(); + + BL::VolumeGrid b_volume_grid = find_grid(); if (b_volume_grid && unload) { b_volume_grid.unload(); } } + /* Find grid with matching name. Grid point not stored in the class since + * grids may be unloaded before we load the pixels, for example for motion + * blur where we move between frames. */ + BL::VolumeGrid find_grid() + { +#ifdef WITH_OPENVDB + BL::Volume::grids_iterator b_grid_iter; + for (b_volume.grids.begin(b_grid_iter); b_grid_iter != b_volume.grids.end(); ++b_grid_iter) { + if (b_grid_iter->name() == grid_name) { + return *b_grid_iter; + } + } +#endif + + return BL::VolumeGrid(PointerRNA_NULL); + } + + BL::BlendData b_data; BL::Volume b_volume; - BL::VolumeGrid b_volume_grid; bool unload; }; @@ -325,7 +338,7 @@ static void sync_volume_object(BL::BlendData &b_data, BL::Object &b_ob, Scene *s mesh->attributes.add(std) : mesh->attributes.add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL); - ImageLoader *loader = new BlenderVolumeLoader(b_volume, name.string()); + ImageLoader *loader = new BlenderVolumeLoader(b_data, b_volume, name.string()); ImageParams params; params.frame = b_volume.grids.frame(); diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index fb724704a84..8b8f3ca7265 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -9,8 +9,6 @@ set(INC_SYS set(SRC bvh.cpp bvh2.cpp - bvh4.cpp - bvh8.cpp bvh_binning.cpp bvh_build.cpp bvh_embree.cpp @@ -24,8 +22,6 @@ set(SRC set(SRC_HEADERS bvh.h bvh2.h - bvh4.h - bvh8.h bvh_binning.h bvh_build.h bvh_embree.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 0313bcd68b0..e9e67fd1305 100644 --- 
a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -22,17 +22,10 @@ #include "render/object.h" #include "bvh/bvh2.h" -#include "bvh/bvh4.h" -#include "bvh/bvh8.h" #include "bvh/bvh_build.h" +#include "bvh/bvh_embree.h" #include "bvh/bvh_node.h" - -#ifdef WITH_OPTIX -# include "bvh/bvh_optix.h" -#endif -#ifdef WITH_EMBREE -# include "bvh/bvh_embree.h" -#endif +#include "bvh/bvh_optix.h" #include "util/util_foreach.h" #include "util/util_logging.h" @@ -47,10 +40,6 @@ const char *bvh_layout_name(BVHLayout layout) switch (layout) { case BVH_LAYOUT_BVH2: return "BVH2"; - case BVH_LAYOUT_BVH4: - return "BVH4"; - case BVH_LAYOUT_BVH8: - return "BVH8"; case BVH_LAYOUT_NONE: return "NONE"; case BVH_LAYOUT_EMBREE: @@ -114,10 +103,6 @@ BVH *BVH::create(const BVHParams ¶ms, switch (params.bvh_layout) { case BVH_LAYOUT_BVH2: return new BVH2(params, geometry, objects); - case BVH_LAYOUT_BVH4: - return new BVH4(params, geometry, objects); - case BVH_LAYOUT_BVH8: - return new BVH8(params, geometry, objects); case BVH_LAYOUT_EMBREE: #ifdef WITH_EMBREE return new BVHEmbree(params, geometry, objects); @@ -337,13 +322,6 @@ void BVH::pack_primitives() void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) { - /* The BVH's for instances are built separately, but for traversal all - * BVH's are stored in global arrays. This function merges them into the - * top level BVH, adjusting indexes and offsets where appropriate. - */ - const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4); - const bool use_obvh = (params.bvh_layout == BVH_LAYOUT_BVH8); - /* Adjust primitive index to point to the triangle in the global array, for * geometry with transform applied and already in the top level BVH. 
*/ @@ -506,53 +484,21 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) for (size_t i = 0, j = 0; i < bvh_nodes_size; j++) { size_t nsize, nsize_bbox; if (bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) { - if (use_obvh) { - nsize = BVH_UNALIGNED_ONODE_SIZE; - nsize_bbox = BVH_UNALIGNED_ONODE_SIZE - 1; - } - else { - nsize = use_qbvh ? BVH_UNALIGNED_QNODE_SIZE : BVH_UNALIGNED_NODE_SIZE; - nsize_bbox = (use_qbvh) ? BVH_UNALIGNED_QNODE_SIZE - 1 : 0; - } + nsize = BVH_UNALIGNED_NODE_SIZE; + nsize_bbox = 0; } else { - if (use_obvh) { - nsize = BVH_ONODE_SIZE; - nsize_bbox = BVH_ONODE_SIZE - 1; - } - else { - nsize = (use_qbvh) ? BVH_QNODE_SIZE : BVH_NODE_SIZE; - nsize_bbox = (use_qbvh) ? BVH_QNODE_SIZE - 1 : 0; - } + nsize = BVH_NODE_SIZE; + nsize_bbox = 0; } memcpy(pack_nodes + pack_nodes_offset, bvh_nodes + i, nsize_bbox * sizeof(int4)); /* Modify offsets into arrays */ int4 data = bvh_nodes[i + nsize_bbox]; - - if (use_obvh) { - int4 data1 = bvh_nodes[i + nsize_bbox - 1]; - data.z += (data.z < 0) ? -noffset_leaf : noffset; - data.w += (data.w < 0) ? -noffset_leaf : noffset; - data.x += (data.x < 0) ? -noffset_leaf : noffset; - data.y += (data.y < 0) ? -noffset_leaf : noffset; - data1.z += (data1.z < 0) ? -noffset_leaf : noffset; - data1.w += (data1.w < 0) ? -noffset_leaf : noffset; - data1.x += (data1.x < 0) ? -noffset_leaf : noffset; - data1.y += (data1.y < 0) ? -noffset_leaf : noffset; - pack_nodes[pack_nodes_offset + nsize_bbox] = data; - pack_nodes[pack_nodes_offset + nsize_bbox - 1] = data1; - } - else { - data.z += (data.z < 0) ? -noffset_leaf : noffset; - data.w += (data.w < 0) ? -noffset_leaf : noffset; - if (use_qbvh) { - data.x += (data.x < 0) ? -noffset_leaf : noffset; - data.y += (data.y < 0) ? -noffset_leaf : noffset; - } - pack_nodes[pack_nodes_offset + nsize_bbox] = data; - } + data.z += (data.z < 0) ? -noffset_leaf : noffset; + data.w += (data.w < 0) ? 
-noffset_leaf : noffset; + pack_nodes[pack_nodes_offset + nsize_bbox] = data; /* Usually this copies nothing, but we better * be prepared for possible node size extension. diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index bdde38640c9..6639e06b0bc 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -76,7 +76,7 @@ struct PackedBVH { } }; -enum BVH_TYPE { bvh2, bvh4, bvh8 }; +enum BVH_TYPE { bvh2 }; /* BVH */ diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp deleted file mode 100644 index 143c3e54f94..00000000000 --- a/intern/cycles/bvh/bvh4.cpp +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Adapted from code copyright 2009-2010 NVIDIA Corporation - * Modifications Copyright 2011, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "bvh/bvh4.h" - -#include "render/mesh.h" -#include "render/object.h" - -#include "bvh/bvh_node.h" -#include "bvh/bvh_unaligned.h" - -CCL_NAMESPACE_BEGIN - -/* Can we avoid this somehow or make more generic? - * - * Perhaps we can merge nodes in actual tree and make our - * life easier all over the place. 
- */ - -BVH4::BVH4(const BVHParams ¶ms_, - const vector<Geometry *> &geometry_, - const vector<Object *> &objects_) - : BVH(params_, geometry_, objects_) -{ - params.bvh_layout = BVH_LAYOUT_BVH4; -} - -namespace { - -BVHNode *bvh_node_merge_children_recursively(const BVHNode *node) -{ - if (node->is_leaf()) { - return new LeafNode(*reinterpret_cast<const LeafNode *>(node)); - } - /* Collect nodes of one layer deeper, allowing us to have more children in an inner layer. */ - assert(node->num_children() <= 2); - const BVHNode *children[4]; - const BVHNode *child0 = node->get_child(0); - const BVHNode *child1 = node->get_child(1); - int num_children = 0; - if (child0->is_leaf()) { - children[num_children++] = child0; - } - else { - children[num_children++] = child0->get_child(0); - children[num_children++] = child0->get_child(1); - } - if (child1->is_leaf()) { - children[num_children++] = child1; - } - else { - children[num_children++] = child1->get_child(0); - children[num_children++] = child1->get_child(1); - } - /* Merge children in subtrees. */ - BVHNode *children4[4]; - for (int i = 0; i < num_children; ++i) { - children4[i] = bvh_node_merge_children_recursively(children[i]); - } - /* Allocate new node. */ - BVHNode *node4 = new InnerNode(node->bounds, children4, num_children); - /* TODO(sergey): Consider doing this from the InnerNode() constructor. - * But in order to do this nicely need to think of how to pass all the - * parameters there. */ - if (node->is_unaligned) { - node4->is_unaligned = true; - node4->aligned_space = new Transform(); - *node4->aligned_space = *node->aligned_space; - } - return node4; -} - -} // namespace - -BVHNode *BVH4::widen_children_nodes(const BVHNode *root) -{ - if (root == NULL) { - return NULL; - } - if (root->is_leaf()) { - return const_cast<BVHNode *>(root); - } - BVHNode *root4 = bvh_node_merge_children_recursively(root); - /* TODO(sergey): Pack children nodes to parents which has less that 4 - * children. 
*/ - return root4; -} - -void BVH4::pack_leaf(const BVHStackEntry &e, const LeafNode *leaf) -{ - float4 data[BVH_QNODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - if (leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->lo); - data[0].y = __int_as_float(leaf->hi); - } - data[0].z = __uint_as_float(leaf->visibility); - if (leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4) * BVH_QNODE_LEAF_SIZE); -} - -void BVH4::pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num) -{ - bool has_unaligned = false; - /* Check whether we have to create unaligned node or all nodes are aligned - * and we can cut some corner here. - */ - if (params.use_unaligned_nodes) { - for (int i = 0; i < num; i++) { - if (en[i].node->is_unaligned) { - has_unaligned = true; - break; - } - } - } - if (has_unaligned) { - /* There's no unaligned children, pack into AABB node. */ - pack_unaligned_inner(e, en, num); - } - else { - /* Create unaligned node with orientation transform for each of the - * children. 
- */ - pack_aligned_inner(e, en, num); - } -} - -void BVH4::pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num) -{ - BoundBox bounds[4]; - int child[4]; - for (int i = 0; i < num; ++i) { - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_aligned_node( - e.idx, bounds, child, e.node->visibility, e.node->time_from, e.node->time_to, num); -} - -void BVH4::pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for (int i = 0; i < num; i++) { - float3 bb_min = bounds[i].min; - float3 bb_max = bounds[i].max; - - data[1][i] = bb_min.x; - data[2][i] = bb_max.x; - data[3][i] = bb_min.y; - data[4][i] = bb_max.y; - data[5][i] = bb_min.z; - data[6][i] = bb_max.z; - - data[7][i] = __int_as_float(child[i]); - } - - for (int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - data[1][i] = FLT_MAX; - data[2][i] = -FLT_MAX; - - data[3][i] = FLT_MAX; - data[4][i] = -FLT_MAX; - - data[5][i] = FLT_MAX; - data[6][i] = -FLT_MAX; - - data[7][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_QNODE_SIZE); -} - -void BVH4::pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num) -{ - Transform aligned_space[4]; - BoundBox bounds[4]; - int child[4]; - for (int i = 0; i < num; ++i) { - aligned_space[i] = en[i].node->get_aligned_space(); - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_unaligned_node(e.idx, - aligned_space, - bounds, - child, - e.node->visibility, - e.node->time_from, - e.node->time_to, - num); -} - -void BVH4::pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_UNALIGNED_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for (int i = 0; i < num; i++) { - Transform space = BVHUnaligned::compute_node_transform(bounds[i], aligned_space[i]); - - data[1][i] = space.x.x; - data[2][i] = space.x.y; - data[3][i] = space.x.z; - - data[4][i] = space.y.x; - data[5][i] = space.y.y; - data[6][i] = space.y.z; - - data[7][i] = space.z.x; - data[8][i] = space.z.y; - data[9][i] = space.z.z; - - data[10][i] = space.x.w; - data[11][i] = space.y.w; - data[12][i] = space.z.w; - - data[13][i] = __int_as_float(child[i]); - } - - for (int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - - data[1][i] = NAN; - data[2][i] = NAN; - data[3][i] = NAN; - - data[4][i] = NAN; - data[5][i] = NAN; - data[6][i] = NAN; - - data[7][i] = NAN; - data[8][i] = NAN; - data[9][i] = NAN; - - data[10][i] = NAN; - data[11][i] = NAN; - data[12][i] = NAN; - - data[13][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_UNALIGNED_QNODE_SIZE); -} - -/* Quad SIMD Nodes */ - -void BVH4::pack_nodes(const BVHNode *root) -{ - /* Calculate size of the arrays required. */ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if (params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_QNODE_SIZE; - } - /* Resize arrays. */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. */ - if (params.top_level) { - pack_instances(node_size, num_leaf_nodes * BVH_QNODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes * BVH_QNODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH * 2); - if (root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += root->has_unaligned() ? 
BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE; - } - - while (stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if (e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode *>(e.node); - pack_leaf(e, leaf); - } - else { - /* Inner node. */ - /* Collect nodes. */ - const BVHNode *children[4]; - const int num_children = e.node->num_children(); - /* Push entries on the stack. */ - for (int i = 0; i < num_children; ++i) { - int idx; - children[i] = e.node->get_child(i); - assert(children[i] != NULL); - if (children[i]->is_leaf()) { - idx = nextLeafNodeIdx++; - } - else { - idx = nextNodeIdx; - nextNodeIdx += children[i]->has_unaligned() ? BVH_UNALIGNED_QNODE_SIZE : BVH_QNODE_SIZE; - } - stack.push_back(BVHStackEntry(children[i], idx)); - } - /* Set node. */ - pack_inner(e, &stack[stack.size() - num_children], num_children); - } - } - - assert(node_size == nextNodeIdx); - /* Root index to start traversal at, to handle case of single leaf node. */ - pack.root_index = (root->is_leaf()) ? -1 : 0; -} - -void BVH4::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1) ? true : false, bbox, visibility); -} - -void BVH4::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility) -{ - if (leaf) { - /* Refit leaf node. */ - int4 *data = &pack.leaf_nodes[idx]; - int4 c = data[0]; - - BVH::refit_primitives(c.x, c.y, bbox, visibility); - - /* TODO(sergey): This is actually a copy of pack_leaf(), - * but this chunk of code only knows actual data and has - * no idea about BVHNode. - * - * Would be nice to de-duplicate code, but trying to make - * making code more general ends up in much nastier code - * in my opinion so far. - * - * Same applies to the inner nodes case below. 
- */ - float4 leaf_data[BVH_QNODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c.x); - leaf_data[0].y = __int_as_float(c.y); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4) * BVH_QNODE_LEAF_SIZE); - } - else { - int4 *data = &pack.nodes[idx]; - bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - int4 c; - if (is_unaligned) { - c = data[13]; - } - else { - c = data[7]; - } - /* Refit inner node, set bbox from children. */ - BoundBox child_bbox[4] = {BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty}; - uint child_visibility[4] = {0}; - int num_nodes = 0; - - for (int i = 0; i < 4; ++i) { - if (c[i] != 0) { - refit_node((c[i] < 0) ? -c[i] - 1 : c[i], (c[i] < 0), child_bbox[i], child_visibility[i]); - ++num_nodes; - bbox.grow(child_bbox[i]); - visibility |= child_visibility[i]; - } - } - - if (is_unaligned) { - Transform aligned_space[4] = { - transform_identity(), transform_identity(), transform_identity(), transform_identity()}; - pack_unaligned_node( - idx, aligned_space, child_bbox, &c[0], visibility, 0.0f, 1.0f, num_nodes); - } - else { - pack_aligned_node(idx, child_bbox, &c[0], visibility, 0.0f, 1.0f, num_nodes); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h deleted file mode 100644 index afbb9007afb..00000000000 --- a/intern/cycles/bvh/bvh4.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Adapted from code copyright 2009-2010 NVIDIA Corporation - * Modifications Copyright 2011, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __BVH4_H__ -#define __BVH4_H__ - -#include "bvh/bvh.h" -#include "bvh/bvh_params.h" - -#include "util/util_types.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -class BVHNode; -struct BVHStackEntry; -class BVHParams; -class BoundBox; -class LeafNode; -class Object; -class Progress; - -#define BVH_QNODE_SIZE 8 -#define BVH_QNODE_LEAF_SIZE 1 -#define BVH_UNALIGNED_QNODE_SIZE 14 - -/* BVH4 - * - * Quad BVH, with each node having four children, to use with SIMD instructions. - */ -class BVH4 : public BVH { - protected: - /* constructor */ - friend class BVH; - BVH4(const BVHParams ¶ms, - const vector<Geometry *> &geometry, - const vector<Object *> &objects); - - /* Building process. 
*/ - virtual BVHNode *widen_children_nodes(const BVHNode *root) override; - - /* pack */ - void pack_nodes(const BVHNode *root) override; - - void pack_leaf(const BVHStackEntry &e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num); - - void pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num); - void pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - void pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num); - void pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - /* refit */ - void refit_nodes() override; - void refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility); -}; - -CCL_NAMESPACE_END - -#endif /* __BVH4_H__ */ diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp deleted file mode 100644 index b805865b2c8..00000000000 --- a/intern/cycles/bvh/bvh8.cpp +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Original code Copyright 2017, Intel Corporation - * Modifications Copyright 2018, Blender Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "bvh/bvh8.h" - -#include "render/hair.h" -#include "render/mesh.h" -#include "render/object.h" - -#include "bvh/bvh_node.h" -#include "bvh/bvh_unaligned.h" - -CCL_NAMESPACE_BEGIN - -BVH8::BVH8(const BVHParams ¶ms_, - const vector<Geometry *> &geometry_, - const vector<Object *> &objects_) - : BVH(params_, geometry_, objects_) -{ -} - -namespace { - -BVHNode *bvh_node_merge_children_recursively(const BVHNode *node) -{ - if (node->is_leaf()) { - return new LeafNode(*reinterpret_cast<const LeafNode *>(node)); - } - /* Collect nodes of two layer deeper, allowing us to have more childrem in - * an inner layer. 
*/ - assert(node->num_children() <= 2); - const BVHNode *children[8]; - const BVHNode *child0 = node->get_child(0); - const BVHNode *child1 = node->get_child(1); - int num_children = 0; - if (child0->is_leaf()) { - children[num_children++] = child0; - } - else { - const BVHNode *child00 = child0->get_child(0), *child01 = child0->get_child(1); - if (child00->is_leaf()) { - children[num_children++] = child00; - } - else { - children[num_children++] = child00->get_child(0); - children[num_children++] = child00->get_child(1); - } - if (child01->is_leaf()) { - children[num_children++] = child01; - } - else { - children[num_children++] = child01->get_child(0); - children[num_children++] = child01->get_child(1); - } - } - if (child1->is_leaf()) { - children[num_children++] = child1; - } - else { - const BVHNode *child10 = child1->get_child(0), *child11 = child1->get_child(1); - if (child10->is_leaf()) { - children[num_children++] = child10; - } - else { - children[num_children++] = child10->get_child(0); - children[num_children++] = child10->get_child(1); - } - if (child11->is_leaf()) { - children[num_children++] = child11; - } - else { - children[num_children++] = child11->get_child(0); - children[num_children++] = child11->get_child(1); - } - } - /* Merge children in subtrees. */ - BVHNode *children4[8]; - for (int i = 0; i < num_children; ++i) { - children4[i] = bvh_node_merge_children_recursively(children[i]); - } - /* Allocate new node. */ - BVHNode *node8 = new InnerNode(node->bounds, children4, num_children); - /* TODO(sergey): Consider doing this from the InnerNode() constructor. - * But in order to do this nicely need to think of how to pass all the - * parameters there. 
*/ - if (node->is_unaligned) { - node8->is_unaligned = true; - node8->aligned_space = new Transform(); - *node8->aligned_space = *node->aligned_space; - } - return node8; -} - -} // namespace - -BVHNode *BVH8::widen_children_nodes(const BVHNode *root) -{ - if (root == NULL) { - return NULL; - } - if (root->is_leaf()) { - return const_cast<BVHNode *>(root); - } - BVHNode *root8 = bvh_node_merge_children_recursively(root); - /* TODO(sergey): Pack children nodes to parents which has less that 4 - * children. */ - return root8; -} - -void BVH8::pack_leaf(const BVHStackEntry &e, const LeafNode *leaf) -{ - float4 data[BVH_ONODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - if (leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->lo); - data[0].y = __int_as_float(leaf->hi); - } - data[0].z = __uint_as_float(leaf->visibility); - if (leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4) * BVH_ONODE_LEAF_SIZE); -} - -void BVH8::pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num) -{ - bool has_unaligned = false; - /* Check whether we have to create unaligned node or all nodes are aligned - * and we can cut some corner here. - */ - if (params.use_unaligned_nodes) { - for (int i = 0; i < num; i++) { - if (en[i].node->is_unaligned) { - has_unaligned = true; - break; - } - } - } - if (has_unaligned) { - /* There's no unaligned children, pack into AABB node. */ - pack_unaligned_inner(e, en, num); - } - else { - /* Create unaligned node with orientation transform for each of the - * children. 
- */ - pack_aligned_inner(e, en, num); - } -} - -void BVH8::pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num) -{ - BoundBox bounds[8]; - int child[8]; - for (int i = 0; i < num; ++i) { - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_aligned_node( - e.idx, bounds, child, e.node->visibility, e.node->time_from, e.node->time_to, num); -} - -void BVH8::pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float8 data[8]; - memset(data, 0, sizeof(data)); - - data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); - data[0].b = time_from; - data[0].c = time_to; - - for (int i = 0; i < num; i++) { - float3 bb_min = bounds[i].min; - float3 bb_max = bounds[i].max; - - data[1][i] = bb_min.x; - data[2][i] = bb_max.x; - data[3][i] = bb_min.y; - data[4][i] = bb_max.y; - data[5][i] = bb_min.z; - data[6][i] = bb_max.z; - - data[7][i] = __int_as_float(child[i]); - } - - for (int i = num; i < 8; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - data[1][i] = FLT_MAX; - data[2][i] = -FLT_MAX; - - data[3][i] = FLT_MAX; - data[4][i] = -FLT_MAX; - - data[5][i] = FLT_MAX; - data[6][i] = -FLT_MAX; - - data[7][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_ONODE_SIZE); -} - -void BVH8::pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num) -{ - Transform aligned_space[8]; - BoundBox bounds[8]; - int child[8]; - for (int i = 0; i < num; ++i) { - aligned_space[i] = en[i].node->get_aligned_space(); - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_unaligned_node(e.idx, - aligned_space, - bounds, - child, - e.node->visibility, - e.node->time_from, - e.node->time_to, - num); -} - -void BVH8::pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float8 data[BVH_UNALIGNED_ONODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); - data[0].b = time_from; - data[0].c = time_to; - - for (int i = 0; i < num; i++) { - Transform space = BVHUnaligned::compute_node_transform(bounds[i], aligned_space[i]); - - data[1][i] = space.x.x; - data[2][i] = space.x.y; - data[3][i] = space.x.z; - - data[4][i] = space.y.x; - data[5][i] = space.y.y; - data[6][i] = space.y.z; - - data[7][i] = space.z.x; - data[8][i] = space.z.y; - data[9][i] = space.z.z; - - data[10][i] = space.x.w; - data[11][i] = space.y.w; - data[12][i] = space.z.w; - - data[13][i] = __int_as_float(child[i]); - } - - for (int i = num; i < 8; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - - data[1][i] = NAN; - data[2][i] = NAN; - data[3][i] = NAN; - - data[4][i] = NAN; - data[5][i] = NAN; - data[6][i] = NAN; - - data[7][i] = NAN; - data[8][i] = NAN; - data[9][i] = NAN; - - data[10][i] = NAN; - data[11][i] = NAN; - data[12][i] = NAN; - - data[13][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4) * BVH_UNALIGNED_ONODE_SIZE); -} - -/* Quad SIMD Nodes */ - -void BVH8::pack_nodes(const BVHNode *root) -{ - /* Calculate size of the arrays required. */ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if (params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_ONODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_ONODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_ONODE_SIZE; - } - /* Resize arrays. */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. */ - if (params.top_level) { - pack_instances(node_size, num_leaf_nodes * BVH_ONODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes * BVH_ONODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH * 2); - if (root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += root->has_unaligned() ? 
BVH_UNALIGNED_ONODE_SIZE : BVH_ONODE_SIZE; - } - - while (stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if (e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode *>(e.node); - pack_leaf(e, leaf); - } - else { - /* Inner node. */ - /* Collect nodes. */ - const BVHNode *children[8]; - int num_children = e.node->num_children(); - /* Push entries on the stack. */ - for (int i = 0; i < num_children; ++i) { - int idx; - children[i] = e.node->get_child(i); - if (children[i]->is_leaf()) { - idx = nextLeafNodeIdx++; - } - else { - idx = nextNodeIdx; - nextNodeIdx += children[i]->has_unaligned() ? BVH_UNALIGNED_ONODE_SIZE : BVH_ONODE_SIZE; - } - stack.push_back(BVHStackEntry(children[i], idx)); - } - /* Set node. */ - pack_inner(e, &stack[stack.size() - num_children], num_children); - } - } - - assert(node_size == nextNodeIdx); - /* Root index to start traversal at, to handle case of single leaf node. */ - pack.root_index = (root->is_leaf()) ? -1 : 0; -} - -void BVH8::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1) ? true : false, bbox, visibility); -} - -void BVH8::refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility) -{ - if (leaf) { - int4 *data = &pack.leaf_nodes[idx]; - int4 c = data[0]; - /* Refit leaf node. */ - for (int prim = c.x; prim < c.y; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if (pidx == -1) { - /* Object instance. */ - bbox.grow(ob->bounds); - } - else { - /* Primitives. */ - if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* Curves. */ - const Hair *hair = static_cast<const Hair *>(ob->geometry); - int prim_offset = (params.top_level) ? 
hair->prim_offset : 0; - Hair::Curve curve = hair->get_curve(pidx - prim_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &hair->curve_keys[0], &hair->curve_radius[0], bbox); - - /* Motion curves. */ - if (hair->use_motion_blur) { - Attribute *attr = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if (attr) { - size_t hair_size = hair->curve_keys.size(); - size_t steps = hair->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for (size_t i = 0; i < steps; i++) { - curve.bounds_grow(k, key_steps + i * hair_size, &hair->curve_radius[0], bbox); - } - } - } - } - else { - /* Triangles. */ - const Mesh *mesh = static_cast<const Mesh *>(ob->geometry); - int prim_offset = (params.top_level) ? mesh->prim_offset : 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - prim_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* Motion triangles. */ - if (mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if (attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for (size_t i = 0; i < steps; i++) { - triangle.bounds_grow(vert_steps + i * mesh_size, bbox); - } - } - } - } - } - - visibility |= ob->visibility; - } - - float4 leaf_data[BVH_ONODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c.x); - leaf_data[0].y = __int_as_float(c.y); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4) * BVH_ONODE_LEAF_SIZE); - } - else { - float8 *data = (float8 *)&pack.nodes[idx]; - bool is_unaligned = (__float_as_uint(data[0].a) & PATH_RAY_NODE_UNALIGNED) != 0; - /* Refit inner node, set bbox from children. 
*/ - BoundBox child_bbox[8] = {BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty}; - int child[8]; - uint child_visibility[8] = {0}; - int num_nodes = 0; - - for (int i = 0; i < 8; ++i) { - child[i] = __float_as_int(data[(is_unaligned) ? 13 : 7][i]); - - if (child[i] != 0) { - refit_node((child[i] < 0) ? -child[i] - 1 : child[i], - (child[i] < 0), - child_bbox[i], - child_visibility[i]); - ++num_nodes; - bbox.grow(child_bbox[i]); - visibility |= child_visibility[i]; - } - } - - if (is_unaligned) { - Transform aligned_space[8] = {transform_identity(), - transform_identity(), - transform_identity(), - transform_identity(), - transform_identity(), - transform_identity(), - transform_identity(), - transform_identity()}; - pack_unaligned_node( - idx, aligned_space, child_bbox, child, visibility, 0.0f, 1.0f, num_nodes); - } - else { - pack_aligned_node(idx, child_bbox, child, visibility, 0.0f, 1.0f, num_nodes); - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h deleted file mode 100644 index d23fa528e3e..00000000000 --- a/intern/cycles/bvh/bvh8.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Original code Copyright 2017, Intel Corporation - * Modifications Copyright 2018, Blender Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __BVH8_H__ -#define __BVH8_H__ - -#include "bvh/bvh.h" -#include "bvh/bvh_params.h" - -#include "util/util_types.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -class BVHNode; -struct BVHStackEntry; -class BVHParams; -class BoundBox; -class LeafNode; -class Object; -class Progress; - -#define BVH_ONODE_SIZE 16 -#define BVH_ONODE_LEAF_SIZE 1 -#define BVH_UNALIGNED_ONODE_SIZE 28 - -/* BVH8 - * - * Octo BVH, with each node having eight children, to use with SIMD instructions. - */ -class BVH8 : public BVH { - protected: - /* constructor */ - friend class BVH; - BVH8(const BVHParams ¶ms, - const vector<Geometry *> &geometry, - const vector<Object *> &objects); - - /* Building process. 
*/ - virtual BVHNode *widen_children_nodes(const BVHNode *root) override; - - /* pack */ - void pack_nodes(const BVHNode *root) override; - - void pack_leaf(const BVHStackEntry &e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num); - - void pack_aligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num); - void pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - void pack_unaligned_inner(const BVHStackEntry &e, const BVHStackEntry *en, int num); - void pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - /* refit */ - void refit_nodes() override; - void refit_node(int idx, bool leaf, BoundBox &bbox, uint &visibility); -}; - -CCL_NAMESPACE_END - -#endif /* __BVH8_H__ */ diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 814b5ced5d2..86ab7b00815 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -39,48 +39,6 @@ CCL_NAMESPACE_BEGIN -/* BVH Build Task */ - -class BVHBuildTask : public Task { - public: - BVHBuildTask( - BVHBuild *build, InnerNode *node, int child, const BVHObjectBinning &range, int level) - : range_(range) - { - run = function_bind(&BVHBuild::thread_build_node, build, node, child, &range_, level); - } - - private: - BVHObjectBinning range_; -}; - -class BVHSpatialSplitBuildTask : public Task { - public: - BVHSpatialSplitBuildTask(BVHBuild *build, - InnerNode *node, - int child, - const BVHRange &range, - const vector<BVHReference> &references, - int level) - : range_(range), - references_(references.begin() + range.start(), references.begin() + range.end()) - { - range_.set_start(0); - run = function_bind(&BVHBuild::thread_build_spatial_split_node, - build, - 
node, - child, - &range_, - &references_, - level, - _1); - } - - private: - BVHRange range_; - vector<BVHReference> references_; -}; - /* Constructor / Destructor */ BVHBuild::BVHBuild(const vector<Object *> &objects_, @@ -201,6 +159,13 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox ¢er, Hair *hair if (hair->has_motion_blur()) { curve_attr_mP = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); } + + const PrimitiveType primitive_type = + (curve_attr_mP != NULL) ? + ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON : + PRIMITIVE_MOTION_CURVE_THICK) : + ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : PRIMITIVE_CURVE_THICK); + const size_t num_curves = hair->num_curves(); for (uint j = 0; j < num_curves; j++) { const Hair::Curve curve = hair->get_curve(j); @@ -211,7 +176,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox ¢er, Hair *hair BoundBox bounds = BoundBox::empty; curve.bounds_grow(k, &hair->curve_keys[0], curve_radius, bounds); if (bounds.valid()) { - int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k); + int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k); references.push_back(BVHReference(bounds, j, i, packed_type)); root.grow(bounds); center.grow(bounds.center2()); @@ -232,7 +197,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox ¢er, Hair *hair curve.bounds_grow(k, key_steps + step * num_keys, curve_radius, bounds); } if (bounds.valid()) { - int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k); + int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k); references.push_back(BVHReference(bounds, j, i, packed_type)); root.grow(bounds); center.grow(bounds.center2()); @@ -288,7 +253,7 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox ¢er, Hair *hair bounds.grow(curr_bounds); if (bounds.valid()) { const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1; - int packed_type = 
PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k); + int packed_type = PRIMITIVE_PACK_SEGMENT(primitive_type, k); references.push_back(BVHReference(bounds, j, i, packed_type, prev_time, curr_time)); root.grow(bounds); center.grow(bounds.center2()); @@ -423,22 +388,6 @@ BVHNode *BVHBuild::run() } spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha; - if (params.use_spatial_split) { - /* NOTE: The API here tries to be as much ready for multi-threaded build - * as possible, but at the same time it tries not to introduce any - * changes in behavior for until all refactoring needed for threading is - * finished. - * - * So we currently allocate single storage for now, which is only used by - * the only thread working on the spatial BVH build. - */ - spatial_storage.resize(TaskScheduler::num_threads() + 1); - size_t num_bins = max(root.size(), (int)BVHParams::NUM_SPATIAL_BINS) - 1; - foreach (BVHSpatialStorage &storage, spatial_storage) { - storage.right_bounds.clear(); - } - spatial_storage[0].right_bounds.resize(num_bins); - } spatial_free_index = 0; need_prim_time = params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0; @@ -465,7 +414,8 @@ BVHNode *BVHBuild::run() if (params.use_spatial_split) { /* Perform multithreaded spatial split build. 
*/ - rootnode = build_node(root, &references, 0, 0); + BVHSpatialStorage *local_storage = &spatial_storage.local(); + rootnode = build_node(root, references, 0, local_storage); task_pool.wait_work(); } else { @@ -475,6 +425,9 @@ BVHNode *BVHBuild::run() task_pool.wait_work(); } + /* clean up temporary memory usage by threads */ + spatial_storage.clear(); + /* delete if we canceled */ if (rootnode) { if (progress.get_cancel()) { @@ -529,41 +482,46 @@ void BVHBuild::progress_update() progress_start_time = time_dt(); } -void BVHBuild::thread_build_node(InnerNode *inner, int child, BVHObjectBinning *range, int level) +void BVHBuild::thread_build_node(InnerNode *inner, + int child, + const BVHObjectBinning &range, + int level) { if (progress.get_cancel()) return; /* build nodes */ - BVHNode *node = build_node(*range, level); + BVHNode *node = build_node(range, level); /* set child in inner node */ inner->children[child] = node; /* update progress */ - if (range->size() < THREAD_TASK_SIZE) { + if (range.size() < THREAD_TASK_SIZE) { /*rotate(node, INT_MAX, 5);*/ thread_scoped_lock lock(build_mutex); - progress_count += range->size(); + progress_count += range.size(); progress_update(); } } void BVHBuild::thread_build_spatial_split_node(InnerNode *inner, int child, - BVHRange *range, - vector<BVHReference> *references, - int level, - int thread_id) + const BVHRange &range, + vector<BVHReference> &references, + int level) { if (progress.get_cancel()) { return; } + /* Get per-thread memory for spatial split. 
*/ + BVHSpatialStorage *local_storage = &spatial_storage.local(); + /* build nodes */ - BVHNode *node = build_node(*range, references, level, thread_id); + BVHNode *node = build_node(range, references, level, local_storage); /* set child in inner node */ inner->children[child] = node; @@ -586,14 +544,22 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange &range, for (int i = 0; i < size; i++) { const BVHReference &ref = references[range.start() + i]; - if (ref.prim_type() & PRIMITIVE_CURVE) - num_curves++; - if (ref.prim_type() & PRIMITIVE_MOTION_CURVE) - num_motion_curves++; - else if (ref.prim_type() & PRIMITIVE_TRIANGLE) - num_triangles++; - else if (ref.prim_type() & PRIMITIVE_MOTION_TRIANGLE) - num_motion_triangles++; + if (ref.prim_type() & PRIMITIVE_ALL_CURVE) { + if (ref.prim_type() & PRIMITIVE_ALL_MOTION) { + num_motion_curves++; + } + else { + num_curves++; + } + } + else if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) { + if (ref.prim_type() & PRIMITIVE_ALL_MOTION) { + num_motion_triangles++; + } + else { + num_triangles++; + } + } } return (num_triangles <= params.max_triangle_leaf_size) && @@ -675,8 +641,8 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level) /* Threaded build */ inner = new InnerNode(bounds); - task_pool.push(new BVHBuildTask(this, inner, 0, left, level + 1), true); - task_pool.push(new BVHBuildTask(this, inner, 1, right, level + 1), true); + task_pool.push([=] { thread_build_node(inner, 0, left, level + 1); }); + task_pool.push([=] { thread_build_node(inner, 1, right, level + 1); }); } if (do_unalinged_split) { @@ -688,9 +654,9 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level) /* multithreaded spatial split builder */ BVHNode *BVHBuild::build_node(const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, int level, - int thread_id) + BVHSpatialStorage *storage) { /* Update progress. 
* @@ -707,18 +673,17 @@ BVHNode *BVHBuild::build_node(const BVHRange &range, if (!(range.size() > 0 && params.top_level && level == 0)) { if (params.small_enough_for_leaf(range.size(), level)) { progress_count += range.size(); - return create_leaf_node(range, *references); + return create_leaf_node(range, references); } } /* Perform splitting test. */ - BVHSpatialStorage *storage = &spatial_storage[thread_id]; BVHMixedSplit split(this, storage, range, references, level); if (!(range.size() > 0 && params.top_level && level == 0)) { if (split.no_split) { progress_count += range.size(); - return create_leaf_node(range, *references); + return create_leaf_node(range, references); } } float leafSAH = params.sah_primitive_cost * split.leafSAH; @@ -731,7 +696,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range, Transform aligned_space; bool do_unalinged_split = false; if (params.use_unaligned_nodes && splitSAH > params.unaligned_split_threshold * leafSAH) { - aligned_space = unaligned_heuristic.compute_aligned_space(range, &references->at(0)); + aligned_space = unaligned_heuristic.compute_aligned_space(range, &references.at(0)); unaligned_split = BVHMixedSplit( this, storage, range, references, level, &unaligned_heuristic, &aligned_space); /* unalignedLeafSAH = params.sah_primitive_cost * split.leafSAH; */ @@ -757,8 +722,7 @@ BVHNode *BVHBuild::build_node(const BVHRange &range, BoundBox bounds; if (do_unalinged_split) { - bounds = unaligned_heuristic.compute_aligned_boundbox( - range, &references->at(0), aligned_space); + bounds = unaligned_heuristic.compute_aligned_boundbox(range, &references.at(0), aligned_space); } else { bounds = range.bounds(); @@ -770,24 +734,35 @@ BVHNode *BVHBuild::build_node(const BVHRange &range, /* Local build. */ /* Build left node. 
*/ - vector<BVHReference> copy(references->begin() + right.start(), - references->begin() + right.end()); + vector<BVHReference> right_references(references.begin() + right.start(), + references.begin() + right.end()); right.set_start(0); - BVHNode *leftnode = build_node(left, references, level + 1, thread_id); + BVHNode *leftnode = build_node(left, references, level + 1, storage); /* Build right node. */ - BVHNode *rightnode = build_node(right, &copy, level + 1, thread_id); + BVHNode *rightnode = build_node(right, right_references, level + 1, storage); inner = new InnerNode(bounds, leftnode, rightnode); } else { /* Threaded build. */ inner = new InnerNode(bounds); - task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 0, left, *references, level + 1), - true); - task_pool.push(new BVHSpatialSplitBuildTask(this, inner, 1, right, *references, level + 1), - true); + + vector<BVHReference> left_references(references.begin() + left.start(), + references.begin() + left.end()); + vector<BVHReference> right_references(references.begin() + right.start(), + references.begin() + right.end()); + right.set_start(0); + + /* Create tasks for left and right nodes, using copy for most arguments and + * move for reference to avoid memory copies. */ + task_pool.push([=, refs = std::move(left_references)]() mutable { + thread_build_spatial_split_node(inner, 0, left, refs, level + 1); + }); + task_pool.push([=, refs = std::move(right_references)]() mutable { + thread_build_spatial_split_node(inner, 1, right, refs, level + 1); + }); } if (do_unalinged_split) { diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index 3fe4c3799e2..c35af083fbd 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -74,9 +74,9 @@ class BVHBuild { /* Building. 
*/ BVHNode *build_node(const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, int level, - int thread_id); + BVHSpatialStorage *storage); BVHNode *build_node(const BVHObjectBinning &range, int level); BVHNode *create_leaf_node(const BVHRange &range, const vector<BVHReference> &references); BVHNode *create_object_leaf_nodes(const BVHReference *ref, int start, int num); @@ -86,13 +86,12 @@ class BVHBuild { /* Threads. */ enum { THREAD_TASK_SIZE = 4096 }; - void thread_build_node(InnerNode *node, int child, BVHObjectBinning *range, int level); + void thread_build_node(InnerNode *node, int child, const BVHObjectBinning &range, int level); void thread_build_spatial_split_node(InnerNode *node, int child, - BVHRange *range, - vector<BVHReference> *references, - int level, - int thread_id); + const BVHRange &range, + vector<BVHReference> &references, + int level); thread_mutex build_mutex; /* Progress. */ @@ -127,7 +126,7 @@ class BVHBuild { /* Spatial splitting. */ float spatial_min_overlap; - vector<BVHSpatialStorage> spatial_storage; + enumerable_thread_specific<BVHSpatialStorage> spatial_storage; size_t spatial_free_index; thread_spin_lock spatial_spin_lock; diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp index 6735202835b..17e1f86a589 100644 --- a/intern/cycles/bvh/bvh_embree.cpp +++ b/intern/cycles/bvh/bvh_embree.cpp @@ -47,9 +47,11 @@ # include "render/hair.h" # include "render/mesh.h" # include "render/object.h" + # include "util/util_foreach.h" # include "util/util_logging.h" # include "util/util_progress.h" +# include "util/util_stats.h" CCL_NAMESPACE_BEGIN @@ -65,30 +67,9 @@ static_assert(Object::MAX_MOTION_STEPS == Geometry::MAX_MOTION_STEPS, * as well as filtering for volume objects happen here. * Cycles' own BVH does that directly inside the traversal calls. 
*/ -static void rtc_filter_func(const RTCFilterFunctionNArguments *args) -{ - /* Current implementation in Cycles assumes only single-ray intersection queries. */ - assert(args->N == 1); - - const RTCRay *ray = (RTCRay *)args->ray; - const RTCHit *hit = (RTCHit *)args->hit; - CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; - KernelGlobals *kg = ctx->kg; - - /* Check if there is backfacing hair to ignore. */ - if (IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) && - !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) && - !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) { - if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), - make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { - *args->valid = 0; - return; - } - } -} - static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) { + /* Current implementation in Cycles assumes only single-ray intersection queries. */ assert(args->N == 1); const RTCRay *ray = (RTCRay *)args->ray; @@ -96,17 +77,6 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; KernelGlobals *kg = ctx->kg; - /* For all ray types: Check if there is backfacing hair to ignore */ - if (IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) && - !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) && - !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) { - if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), - make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { - *args->valid = 0; - return; - } - } - switch (ctx->type) { case CCLIntersectContext::RAY_SHADOW_ALL: { /* Append the intersection to the end of the array. */ @@ -168,7 +138,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) } /* Ignore curves. 
*/ - if (hit->geomID & 1) { + if (IS_HAIR(hit->geomID)) { /* This tells Embree to continue tracing. */ *args->valid = 0; break; @@ -249,6 +219,34 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) } } +static void rtc_filter_func_thick_curve(const RTCFilterFunctionNArguments *args) +{ + const RTCRay *ray = (RTCRay *)args->ray; + RTCHit *hit = (RTCHit *)args->hit; + + /* Always ignore backfacing intersections. */ + if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), + make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { + *args->valid = 0; + return; + } +} + +static void rtc_filter_occluded_func_thick_curve(const RTCFilterFunctionNArguments *args) +{ + const RTCRay *ray = (RTCRay *)args->ray; + RTCHit *hit = (RTCHit *)args->hit; + + /* Always ignore backfacing intersections. */ + if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), + make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { + *args->valid = 0; + return; + } + + rtc_filter_occluded_func(args); +} + static size_t unaccounted_mem = 0; static bool rtc_memory_monitor_func(void *userPtr, const ssize_t bytes, const bool) @@ -326,8 +324,6 @@ BVHEmbree::BVHEmbree(const BVHParams &params_, stats(NULL), curve_subdivisions(params.curve_subdivisions), build_quality(RTC_BUILD_QUALITY_REFIT), - use_curves(params_.curve_flags & CURVE_KN_INTERPOLATE), - use_ribbons(params.curve_flags & CURVE_KN_RIBBONS), dynamic_scene(true) { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); @@ -653,7 +649,6 @@ void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i) } rtcSetGeometryUserData(geom_id, (void *)prim_offset); - rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func); rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func); rtcSetGeometryMask(geom_id, ob->visibility_for_tracing()); @@ -724,9 +719,7 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair /* Catmull-Rom splines need extra CVs at the beginning and end of each 
curve. */ size_t num_keys_embree = num_keys; - if (use_curves) { - num_keys_embree += num_curves * 2; - } + num_keys_embree += num_curves * 2; /* Copy the CV data to Embree */ const int t_mid = (num_motion_steps - 1) / 2; @@ -746,45 +739,22 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair assert(rtc_verts); if (rtc_verts) { - if (use_curves) { - const size_t num_curves = hair->num_curves(); - for (size_t j = 0; j < num_curves; ++j) { - Hair::Curve c = hair->get_curve(j); - int fk = c.first_key; - int k = 1; - for (; k < c.num_keys + 1; ++k, ++fk) { - rtc_verts[k] = float3_to_float4(verts[fk]); - rtc_verts[k].w = curve_radius[fk]; - } - /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. */ - rtc_verts[0] = rtc_verts[1]; - rtc_verts[k] = rtc_verts[k - 1]; - rtc_verts += c.num_keys + 2; - } - } - else { - for (size_t j = 0; j < num_keys_embree; ++j) { - rtc_verts[j] = float3_to_float4(verts[j]); - rtc_verts[j].w = curve_radius[j]; + const size_t num_curves = hair->num_curves(); + for (size_t j = 0; j < num_curves; ++j) { + Hair::Curve c = hair->get_curve(j); + int fk = c.first_key; + int k = 1; + for (; k < c.num_keys + 1; ++k, ++fk) { + rtc_verts[k] = float3_to_float4(verts[fk]); + rtc_verts[k].w = curve_radius[fk]; } + /* Duplicate Embree's Catmull-Rom spline CVs at the start and end of each curve. 
*/ + rtc_verts[0] = rtc_verts[1]; + rtc_verts[k] = rtc_verts[k - 1]; + rtc_verts += c.num_keys + 2; } } } -# if RTC_VERSION >= 30900 - if (!use_curves) { - unsigned char *flags = (unsigned char *)rtcSetNewGeometryBuffer(geom_id, - RTC_BUFFER_TYPE_FLAGS, - 0, - RTC_FORMAT_UCHAR, - sizeof(unsigned char), - num_keys_embree); - flags[0] = RTC_CURVE_FLAG_NEIGHBOR_RIGHT; - ::memset(flags + 1, - RTC_CURVE_FLAG_NEIGHBOR_RIGHT | RTC_CURVE_FLAG_NEIGHBOR_RIGHT, - num_keys_embree - 2); - flags[num_keys_embree - 1] = RTC_CURVE_FLAG_NEIGHBOR_LEFT; - } -# endif } void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i) @@ -800,6 +770,12 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i) } const size_t num_motion_steps = min(num_geometry_motion_steps, RTC_MAX_TIME_STEP_COUNT); + const PrimitiveType primitive_type = + (num_motion_steps > 1) ? + ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON : + PRIMITIVE_MOTION_CURVE_THICK) : + ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : PRIMITIVE_CURVE_THICK); + assert(num_geometry_motion_steps <= RTC_MAX_TIME_STEP_COUNT); const size_t num_curves = hair->num_curves(); @@ -820,21 +796,12 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i) size_t prim_tri_index_size = pack.prim_index.size(); pack.prim_tri_index.resize(prim_tri_index_size + num_segments); -# if RTC_VERSION >= 30900 - enum RTCGeometryType type = (!use_curves) ? - (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : - RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE) : - (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : - RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE); -# else - enum RTCGeometryType type = (!use_curves) ? - RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : - (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : - RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE); -# endif + enum RTCGeometryType type = (hair->curve_shape == CURVE_RIBBON ? 
+ RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE : + RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE); RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type); - rtcSetGeometryTessellationRate(geom_id, curve_subdivisions); + rtcSetGeometryTessellationRate(geom_id, curve_subdivisions + 1); unsigned *rtc_indices = (unsigned *)rtcSetNewGeometryBuffer( geom_id, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT, sizeof(int), num_segments); size_t rtc_index = 0; @@ -842,14 +809,11 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i) Hair::Curve c = hair->get_curve(j); for (size_t k = 0; k < c.num_segments(); ++k) { rtc_indices[rtc_index] = c.first_key + k; - if (use_curves) { - /* Room for extra CVs at Catmull-Rom splines. */ - rtc_indices[rtc_index] += j * 2; - } + /* Room for extra CVs at Catmull-Rom splines. */ + rtc_indices[rtc_index] += j * 2; /* Cycles specific data. */ pack.prim_object[prim_object_size + rtc_index] = i; - pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT( - num_motion_steps > 1 ? 
PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k)); + pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(primitive_type, k)); pack.prim_index[prim_index_size + rtc_index] = j; pack.prim_tri_index[prim_tri_index_size + rtc_index] = rtc_index; @@ -863,8 +827,13 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i) update_curve_vertex_buffer(geom_id, hair); rtcSetGeometryUserData(geom_id, (void *)prim_offset); - rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func); - rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func); + if (hair->curve_shape == CURVE_RIBBON) { + rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func); + } + else { + rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_thick_curve); + rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_thick_curve); + } rtcSetGeometryMask(geom_id, ob->visibility_for_tracing()); rtcCommitGeometry(geom_id); diff --git a/intern/cycles/bvh/bvh_embree.h b/intern/cycles/bvh/bvh_embree.h index eb121d060b7..f60a1ca0102 100644 --- a/intern/cycles/bvh/bvh_embree.h +++ b/intern/cycles/bvh/bvh_embree.h @@ -81,7 +81,7 @@ class BVHEmbree : public BVH { vector<RTCScene> delayed_delete_scenes; int curve_subdivisions; enum RTCBuildQuality build_quality; - bool use_curves, use_ribbons, dynamic_scene; + bool dynamic_scene; }; CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp index 740994b2ebc..ccb7ae08625 100644 --- a/intern/cycles/bvh/bvh_optix.cpp +++ b/intern/cycles/bvh/bvh_optix.cpp @@ -18,10 +18,14 @@ #ifdef WITH_OPTIX # include "bvh/bvh_optix.h" + +# include "device/device.h" + # include "render/geometry.h" # include "render/hair.h" # include "render/mesh.h" # include "render/object.h" + # include "util/util_foreach.h" # include "util/util_logging.h" # include "util/util_progress.h" @@ -73,9 +77,12 @@ void BVHOptiX::pack_blas() // 'pack.prim_time' is only used in 
geom_curve_intersect.h // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH - uint type = PRIMITIVE_CURVE; - if (hair->use_motion_blur && hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) - type = PRIMITIVE_MOTION_CURVE; + uint type = (hair->use_motion_blur && + hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) ? + ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_MOTION_CURVE_RIBBON : + PRIMITIVE_MOTION_CURVE_THICK) : + ((hair->curve_shape == CURVE_RIBBON) ? PRIMITIVE_CURVE_RIBBON : + PRIMITIVE_CURVE_THICK); for (size_t j = 0; j < num_curves; ++j) { const Hair::Curve curve = hair->get_curve(j); diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 5e2c4b63f1b..1a50742dc33 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -89,7 +89,6 @@ class BVHParams { int bvh_type; /* These are needed for Embree. */ - int curve_flags; int curve_subdivisions; /* fixed parameters */ @@ -122,7 +121,6 @@ class BVHParams { bvh_type = 0; - curve_flags = 0; curve_subdivisions = 4; } diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index 4498a759c08..b01785b547a 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -88,18 +88,6 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool, const int job_end, const BVHReferenceCompare &compare); -class BVHSortTask : public Task { - public: - BVHSortTask(TaskPool *task_pool, - BVHReference *data, - const int job_start, - const int job_end, - const BVHReferenceCompare &compare) - { - run = function_bind(bvh_reference_sort_threaded, task_pool, data, job_start, job_end, compare); - } -}; - /* Multi-threaded reference sort. 
*/ static void bvh_reference_sort_threaded(TaskPool *task_pool, BVHReference *data, @@ -158,7 +146,8 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool, have_work = false; if (left < end) { if (start < right) { - task_pool->push(new BVHSortTask(task_pool, data, left, end, compare), true); + task_pool->push( + function_bind(bvh_reference_sort_threaded, task_pool, data, left, end, compare)); } else { start = left; diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index acdca0f13ad..4b21f852d7a 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -33,7 +33,7 @@ CCL_NAMESPACE_BEGIN BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, float nodeSAH, const BVHUnaligned *unaligned_heuristic, const Transform *aligned_space) @@ -43,7 +43,7 @@ BVHObjectSplit::BVHObjectSplit(BVHBuild *builder, left_bounds(BoundBox::empty), right_bounds(BoundBox::empty), storage_(storage), - references_(references), + references_(&references), unaligned_heuristic_(unaligned_heuristic), aligned_space_(aligned_space) { @@ -133,7 +133,7 @@ void BVHObjectSplit::split(BVHRange &left, BVHRange &right, const BVHRange &rang BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder, BVHSpatialStorage *storage, const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, float nodeSAH, const BVHUnaligned *unaligned_heuristic, const Transform *aligned_space) @@ -141,7 +141,7 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder, dim(0), pos(0.0f), storage_(storage), - references_(references), + references_(&references), unaligned_heuristic_(unaligned_heuristic), aligned_space_(aligned_space) { @@ -152,7 +152,7 @@ BVHSpatialSplit::BVHSpatialSplit(const BVHBuild &builder, } else { range_bounds = unaligned_heuristic->compute_aligned_boundbox( - range, &references->at(0), 
*aligned_space); + range, &references_->at(0), *aligned_space); } float3 origin = range_bounds.min; diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h index 5f2e41cf343..28ff0e05fc3 100644 --- a/intern/cycles/bvh/bvh_split.h +++ b/intern/cycles/bvh/bvh_split.h @@ -44,7 +44,7 @@ class BVHObjectSplit { BVHObjectSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, float nodeSAH, const BVHUnaligned *unaligned_heuristic = NULL, const Transform *aligned_space = NULL); @@ -82,7 +82,7 @@ class BVHSpatialSplit { BVHSpatialSplit(const BVHBuild &builder, BVHSpatialStorage *storage, const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, float nodeSAH, const BVHUnaligned *unaligned_heuristic = NULL, const Transform *aligned_space = NULL); @@ -187,7 +187,7 @@ class BVHMixedSplit { __forceinline BVHMixedSplit(BVHBuild *builder, BVHSpatialStorage *storage, const BVHRange &range, - vector<BVHReference> *references, + vector<BVHReference> &references, int level, const BVHUnaligned *unaligned_heuristic = NULL, const Transform *aligned_space = NULL) @@ -197,7 +197,7 @@ class BVHMixedSplit { } else { bounds = unaligned_heuristic->compute_aligned_boundbox( - range, &references->at(0), *aligned_space); + range, &references.at(0), *aligned_space); } /* find split candidates. */ float area = bounds.safe_area(); @@ -220,7 +220,7 @@ class BVHMixedSplit { /* leaf SAH is the lowest => create leaf. 
*/ minSAH = min(min(leafSAH, object.sah), spatial.sah); - no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range, *references)); + no_split = (minSAH == leafSAH && builder->range_within_max_leaf_size(range, references)); } __forceinline void split(BVHBuild *builder, diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp index f0995f343fe..c969b361643 100644 --- a/intern/cycles/bvh/bvh_unaligned.cpp +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -68,7 +68,8 @@ bool BVHUnaligned::compute_aligned_space(const BVHReference &ref, Transform *ali const Object *object = objects_[ref.prim_object()]; const int packed_type = ref.prim_type(); const int type = (packed_type & PRIMITIVE_ALL); - if (type & PRIMITIVE_CURVE) { + /* No motion blur curves here, we can't fit them to aligned boxes well. */ + if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) { const int curve_index = ref.prim_index(); const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type); const Hair *hair = static_cast<const Hair *>(object->geometry); @@ -93,7 +94,8 @@ BoundBox BVHUnaligned::compute_aligned_prim_boundbox(const BVHReference &prim, const Object *object = objects_[prim.prim_object()]; const int packed_type = prim.prim_type(); const int type = (packed_type & PRIMITIVE_ALL); - if (type & PRIMITIVE_CURVE) { + /* No motion blur curves here, we can't fit them to aligned boxes well. 
*/ + if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) { const int curve_index = prim.prim_index(); const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type); const Hair *hair = static_cast<const Hair *>(object->geometry); diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index aa5b65a2b73..ca366722eb7 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -99,6 +99,18 @@ if(WITH_CYCLES_DEVICE_MULTI) add_definitions(-DWITH_MULTI) endif() +if(WITH_OPENIMAGEDENOISE) + add_definitions(-DWITH_OPENIMAGEDENOISE) + add_definitions(-DOIDN_STATIC_LIB) + list(APPEND INC_SYS + ${OPENIMAGEDENOISE_INCLUDE_DIRS} + ) + list(APPEND LIB + ${OPENIMAGEDENOISE_LIBRARIES} + ${TBB_LIBRARIES} + ) +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h index 1aa2fdd0967..e5e3e24165d 100644 --- a/intern/cycles/device/cuda/device_cuda.h +++ b/intern/cycles/device/cuda/device_cuda.h @@ -21,6 +21,7 @@ # include "device/device_split_kernel.h" # include "util/util_map.h" +# include "util/util_task.h" # ifdef WITH_CUDA_DYNLOAD # include "cuew.h" @@ -96,9 +97,9 @@ class CUDADevice : public Device { static bool have_precompiled_kernels(); - virtual bool show_samples() const; + virtual bool show_samples() const override; - virtual BVHLayoutMask get_bvh_layout_mask() const; + virtual BVHLayoutMask get_bvh_layout_mask() const override; void set_error(const string &error) override; @@ -108,7 +109,7 @@ class CUDADevice : public Device { bool support_device(const DeviceRequestedFeatures & /*requested_features*/); - bool check_peer_access(Device *peer_device); + bool check_peer_access(Device *peer_device) override; bool use_adaptive_compilation(); @@ -122,7 +123,7 @@ class CUDADevice : public Device { const char *base = "cuda", bool force_ptx = false); - virtual bool load_kernels(const 
DeviceRequestedFeatures &requested_features); + virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override; void load_functions(); @@ -140,19 +141,19 @@ class CUDADevice : public Device { void generic_free(device_memory &mem); - void mem_alloc(device_memory &mem); + void mem_alloc(device_memory &mem) override; - void mem_copy_to(device_memory &mem); + void mem_copy_to(device_memory &mem) override; - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; - void mem_zero(device_memory &mem); + void mem_zero(device_memory &mem) override; - void mem_free(device_memory &mem); + void mem_free(device_memory &mem) override; - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/); + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; - virtual void const_copy_to(const char *name, void *host, size_t size); + virtual void const_copy_to(const char *name, void *host, size_t size) override; void global_alloc(device_memory &mem); @@ -252,15 +253,15 @@ class CUDADevice : public Device { int dw, int dh, bool transparent, - const DeviceDrawParams &draw_params); + const DeviceDrawParams &draw_params) override; - void thread_run(DeviceTask *task); + void thread_run(DeviceTask &task); - virtual void task_add(DeviceTask &task); + virtual void task_add(DeviceTask &task) override; - virtual void task_wait(); + virtual void task_wait() override; - virtual void task_cancel(); + virtual void task_cancel() override; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index 7aa63ff48c3..b9bbeb9a25b 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -105,7 +105,7 @@ class CUDASplitKernel : public DeviceSplitKernel { virtual SplitKernelFunction *get_split_kernel_function(const 
string &kernel_name, const DeviceRequestedFeatures &); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); }; /* Utility to push/pop CUDA context. */ @@ -243,7 +243,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool CUDADevice::~CUDADevice() { - task_pool.stop(); + task_pool.cancel(); delete split_kernel; @@ -2326,11 +2326,11 @@ void CUDADevice::draw_pixels(device_memory &mem, Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); } -void CUDADevice::thread_run(DeviceTask *task) +void CUDADevice::thread_run(DeviceTask &task) { CUDAContextScope scope(this); - if (task->type == DeviceTask::RENDER) { + if (task.type == DeviceTask::RENDER) { DeviceRequestedFeatures requested_features; if (use_split_kernel()) { if (split_kernel == NULL) { @@ -2343,72 +2343,64 @@ void CUDADevice::thread_run(DeviceTask *task) /* keep rendering tiles until done */ RenderTile tile; - DenoisingTask denoising(this, *task); + DenoisingTask denoising(this, task); - while (task->acquire_tile(this, tile, task->tile_types)) { + while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel()) { device_only_memory<uchar> void_buffer(this, "void_buffer"); split_kernel->path_trace(task, tile, void_buffer, void_buffer); } else { - render(*task, tile, work_tiles); + render(task, tile, work_tiles); } } else if (tile.task == RenderTile::BAKE) { - render(*task, tile, work_tiles); + render(task, tile, work_tiles); } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } - task->release_tile(tile); + task.release_tile(tile); - if 
(task->get_cancel()) { - if (task->need_finish_queue == false) + if (task.get_cancel()) { + if (task.need_finish_queue == false) break; } } work_tiles.free(); } - else if (task->type == DeviceTask::SHADER) { - shader(*task); + else if (task.type == DeviceTask::SHADER) { + shader(task); cuda_assert(cuCtxSynchronize()); } - else if (task->type == DeviceTask::DENOISE_BUFFER) { + else if (task.type == DeviceTask::DENOISE_BUFFER) { RenderTile tile; - tile.x = task->x; - tile.y = task->y; - tile.w = task->w; - tile.h = task->h; - tile.buffer = task->buffer; - tile.sample = task->sample + task->num_samples; - tile.num_samples = task->num_samples; - tile.start_sample = task->sample; - tile.offset = task->offset; - tile.stride = task->stride; - tile.buffers = task->buffers; - - DenoisingTask denoising(this, *task); + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + DenoisingTask denoising(this, task); denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } } -class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CUDADevice::thread_run, device, this); - } -}; - void CUDADevice::task_add(DeviceTask &task) { CUDAContextScope scope(this); @@ -2424,7 +2416,10 @@ void CUDADevice::task_add(DeviceTask &task) film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); } else { - task_pool.push(new CUDADeviceTask(this, task)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); } } @@ -2652,7 +2647,7 @@ int2 CUDASplitKernel::split_kernel_local_size() int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, 
device_memory &data, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { CUDAContextScope scope(device); size_t free; diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 41dd7894d93..9dbb33980b4 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -77,7 +77,7 @@ std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &reques /* Device */ -Device::~Device() +Device::~Device() noexcept(false) { if (!background) { if (vertex_buffer != 0) { @@ -603,6 +603,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; + info.denoisers = DENOISER_ALL; foreach (const DeviceInfo &device, subdevices) { /* Ensure CPU device does not slow down GPU. */ @@ -647,6 +648,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; + info.denoisers &= device.denoisers; } return info; @@ -667,4 +669,55 @@ void Device::free_memory() network_devices.free_memory(); } +/* DeviceInfo */ + +void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type) +{ + assert(denoising_devices.empty()); + + if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) { + vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); + if (!optix_devices.empty()) { + /* Convert to a special multi device with separate denoising devices. */ + if (multi_devices.empty()) { + multi_devices.push_back(*this); + } + + /* Try to use the same physical devices for denoising. 
*/ + for (const DeviceInfo &cuda_device : multi_devices) { + if (cuda_device.type == DEVICE_CUDA) { + for (const DeviceInfo &optix_device : optix_devices) { + if (cuda_device.num == optix_device.num) { + id += optix_device.id; + denoising_devices.push_back(optix_device); + break; + } + } + } + } + + if (denoising_devices.empty()) { + /* Simply use the first available OptiX device. */ + const DeviceInfo optix_device = optix_devices.front(); + id += optix_device.id; /* Uniquely identify this special multi device. */ + denoising_devices.push_back(optix_device); + } + + denoisers = denoiser_type; + } + } + else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) { + /* Convert to a special multi device with separate denoising devices. */ + if (multi_devices.empty()) { + multi_devices.push_back(*this); + } + + /* Add CPU denoising devices. */ + DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front(); + denoising_devices.push_back(cpu_device); + + denoisers = denoiser_type; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index dff981080a5..a5833369a17 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -83,6 +83,7 @@ class DeviceInfo { bool use_split_kernel; /* Use split or mega kernel. */ bool has_profiling; /* Supports runtime collection of profiling info. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + DenoiserTypeMask denoisers; /* Supported denoiser types. 
*/ int cpu_threads; vector<DeviceInfo> multi_devices; vector<DeviceInfo> denoising_devices; @@ -101,6 +102,7 @@ class DeviceInfo { use_split_kernel = false; has_profiling = false; has_peer_memory = false; + denoisers = DENOISER_NONE; } bool operator==(const DeviceInfo &info) @@ -110,6 +112,9 @@ class DeviceInfo { (type == info.type && num == info.num && description == info.description)); return id == info.id; } + + /* Add additional devices needed for the specified denoiser. */ + void add_denoising_devices(DenoiserType denoiser_type); }; class DeviceRequestedFeatures { @@ -132,6 +137,7 @@ class DeviceRequestedFeatures { /* BVH/sampling kernel features. */ bool use_hair; + bool use_hair_thick; bool use_object_motion; bool use_camera_motion; @@ -178,6 +184,7 @@ class DeviceRequestedFeatures { max_nodes_group = 0; nodes_features = 0; use_hair = false; + use_hair_thick = false; use_object_motion = false; use_camera_motion = false; use_baking = false; @@ -200,6 +207,7 @@ class DeviceRequestedFeatures { max_nodes_group == requested_features.max_nodes_group && nodes_features == requested_features.nodes_features && use_hair == requested_features.use_hair && + use_hair_thick == requested_features.use_hair_thick && use_object_motion == requested_features.use_object_motion && use_camera_motion == requested_features.use_camera_motion && use_baking == requested_features.use_baking && @@ -319,7 +327,8 @@ class Device { virtual void mem_free_sub_ptr(device_ptr /*ptr*/){}; public: - virtual ~Device(); + /* noexcept needed to silence TBB warning. 
*/ + virtual ~Device() noexcept(false); /* info */ DeviceInfo info; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index fc6febd8cee..8f68e66a1b4 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -51,10 +51,12 @@ #include "util/util_function.h" #include "util/util_logging.h" #include "util/util_map.h" +#include "util/util_openimagedenoise.h" #include "util/util_opengl.h" #include "util/util_optimization.h" #include "util/util_progress.h" #include "util/util_system.h" +#include "util/util_task.h" #include "util/util_thread.h" CCL_NAMESPACE_BEGIN @@ -161,7 +163,7 @@ class CPUSplitKernel : public DeviceSplitKernel { virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); }; @@ -176,6 +178,10 @@ class CPUDevice : public Device { #ifdef WITH_OSL OSLGlobals osl_globals; #endif +#ifdef WITH_OPENIMAGEDENOISE + oidn::DeviceRef oidn_device; + oidn::FilterRef oidn_filter; +#endif bool use_split_kernel; @@ -332,7 +338,7 @@ class CPUDevice : public Device { ~CPUDevice() { - task_pool.stop(); + task_pool.cancel(); texture_info.free(); } @@ -344,17 +350,6 @@ class CPUDevice : public Device { virtual BVHLayoutMask get_bvh_layout_mask() const { BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH4; - } - /* MSVC does not support the -march=native switch and you always end up */ - /* with an sse2 kernel when you use WITH_KERNEL_NATIVE. We *cannot* feed */ - /* that kernel BVH8 even if the CPU flags would allow for it. 
*/ -#if (defined(__x86_64__) || defined(_M_X64)) && !(defined(_MSC_VER) && defined(WITH_KERNEL_NATIVE)) - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH8; - } -#endif #ifdef WITH_EMBREE bvh_layout_mask |= BVH_LAYOUT_EMBREE; #endif /* WITH_EMBREE */ @@ -527,26 +522,18 @@ class CPUDevice : public Device { #endif } - void thread_run(DeviceTask *task) + void thread_run(DeviceTask &task) { - if (task->type == DeviceTask::RENDER) - thread_render(*task); - else if (task->type == DeviceTask::SHADER) - thread_shader(*task); - else if (task->type == DeviceTask::FILM_CONVERT) - thread_film_convert(*task); - else if (task->type == DeviceTask::DENOISE_BUFFER) - thread_denoise(*task); + if (task.type == DeviceTask::RENDER) + thread_render(task); + else if (task.type == DeviceTask::SHADER) + thread_shader(task); + else if (task.type == DeviceTask::FILM_CONVERT) + thread_film_convert(task); + else if (task.type == DeviceTask::DENOISE_BUFFER) + thread_denoise(task); } - class CPUDeviceTask : public DeviceTask { - public: - CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CPUDevice::thread_run, device, this); - } - }; - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, @@ -961,7 +948,71 @@ class CPUDevice : public Device { } } - void denoise(DenoisingTask &denoising, RenderTile &tile) + void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + { +#ifdef WITH_OPENIMAGEDENOISE + assert(openimagedenoise_supported()); + + /* Only one at a time, since OpenImageDenoise itself is multithreaded. */ + static thread_mutex mutex; + thread_scoped_lock lock(mutex); + + /* Create device and filter, cached for reuse. 
*/ + if (!oidn_device) { + oidn_device = oidn::newDevice(); + oidn_device.commit(); + } + if (!oidn_filter) { + oidn_filter = oidn_device.newFilter("RT"); + } + + /* Copy pixels from compute device to CPU (no-op for CPU device). */ + rtile.buffers->buffer.copy_from_device(); + + /* Set images with appropriate stride for our interleaved pass storage. */ + const struct { + const char *name; + int offset; + } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR}, + {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL}, + {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO}, + {"output", 0}, + { NULL, + 0 }}; + + for (int i = 0; passes[i].name; i++) { + const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride; + const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float); + const int64_t pixel_stride = task.pass_stride * sizeof(float); + const int64_t row_stride = rtile.stride * pixel_stride; + + oidn_filter.setImage(passes[i].name, + (char *)rtile.buffer + buffer_offset, + oidn::Format::Float3, + rtile.w, + rtile.h, + 0, + pixel_stride, + row_stride); + } + + /* Execute filter. */ + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); + oidn_filter.commit(); + oidn_filter.execute(); + + /* todo: it may be possible to avoid this copy, but we have to ensure that + * when other code copies data from the device it doesn't overwrite the + * denoiser buffers. 
*/ + rtile.buffers->buffer.copy_to_device(); +#else + (void)task; + (void)rtile; +#endif + } + + void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) { ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); @@ -1019,15 +1070,14 @@ class CPUDevice : public Device { } } - RenderTile tile; - DenoisingTask denoising(this, task); - denoising.profiler = &kg->profiler; + DenoisingTask *denoising = NULL; + RenderTile tile; while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel) { device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); + split_kernel->path_trace(task, tile, kgbuffer, void_buffer); } else { render(task, tile, kg); @@ -1037,7 +1087,16 @@ class CPUDevice : public Device { render(task, tile, kg); } else if (tile.task == RenderTile::DENOISE) { - denoise(denoising, tile); + if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + denoise_openimagedenoise(task, tile); + } + else if (task.denoising.type == DENOISER_NLM) { + if (denoising == NULL) { + denoising = new DenoisingTask(this, task); + denoising->profiler = &kg->profiler; + } + denoise_nlm(*denoising, tile); + } task.update_progress(&tile, tile.w * tile.h); } @@ -1055,6 +1114,7 @@ class CPUDevice : public Device { kg->~KernelGlobals(); kgbuffer.free(); delete split_kernel; + delete denoising; } void thread_denoise(DeviceTask &task) @@ -1072,16 +1132,22 @@ class CPUDevice : public Device { tile.stride = task.stride; tile.buffers = task.buffers; - DenoisingTask denoising(this, task); + if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + denoise_openimagedenoise(task, tile); + } + else { + DenoisingTask denoising(this, task); - ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; + ProfilingState denoising_profiler_state; + 
profiler.add_state(&denoising_profiler_state); + denoising.profiler = &denoising_profiler_state; - denoise(denoising, tile); - task.update_progress(&tile, tile.w * tile.h); + denoise_nlm(denoising, tile); + + profiler.remove_state(&denoising_profiler_state); + } - profiler.remove_state(&denoising_profiler_state); + task.update_progress(&tile, tile.w * tile.h); } void thread_film_convert(DeviceTask &task) @@ -1155,13 +1221,24 @@ class CPUDevice : public Device { /* split task into smaller ones */ list<DeviceTask> tasks; - if (task.type == DeviceTask::SHADER) + if (task.type == DeviceTask::DENOISE_BUFFER && + task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + /* Denoise entire buffer at once with OIDN, it has own threading. */ + tasks.push_back(task); + } + else if (task.type == DeviceTask::SHADER) { task.split(tasks, info.cpu_threads, 256); - else + } + else { task.split(tasks, info.cpu_threads); + } - foreach (DeviceTask &task, tasks) - task_pool.push(new CPUDeviceTask(this, task)); + foreach (DeviceTask &task, tasks) { + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); + } } void task_wait() @@ -1326,7 +1403,7 @@ int2 CPUSplitKernel::split_kernel_local_size() int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, device_memory & /*data*/, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { return make_int2(1, 1); } @@ -1358,6 +1435,10 @@ void device_cpu_info(vector<DeviceInfo> &devices) info.has_osl = true; info.has_half_images = true; info.has_profiling = true; + info.denoisers = DENOISER_NLM; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } devices.insert(devices.begin(), info); } diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 04c04761311..d9ffcceb06e 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -130,6 +130,7 @@ void device_cuda_info(vector<DeviceInfo> &devices) 
info.has_half_images = (major >= 3); info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; + info.denoisers = DENOISER_NLM; /* Check if the device has P2P access to any other device in the system. */ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index ac17c02a427..89de80a5bcd 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -56,8 +56,8 @@ DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) tile_info->frames[i] = task.denoising_frames[i - 1]; } - write_passes = task.denoising_write_passes; - do_filter = task.denoising_do_filter; + do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM; + do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM; } DenoisingTask::~DenoisingTask() @@ -91,7 +91,7 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles) target_buffer.stride = rtiles[9].stride; target_buffer.ptr = rtiles[9].buffer; - if (write_passes && rtiles[9].buffers) { + if (do_prefilter && rtiles[9].buffers) { target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); } @@ -111,7 +111,7 @@ void DenoisingTask::setup_denoising_buffer() rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - buffer.use_intensity = write_passes || (tile_info->num_frames > 1); + buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1); buffer.passes = buffer.use_intensity ? 
15 : 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); @@ -343,7 +343,7 @@ void DenoisingTask::run_denoising(RenderTile *tile) reconstruct(); } - if (write_passes) { + if (do_prefilter) { write_buffer(); } diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index bd1d0193dbd..4c122e981eb 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -60,7 +60,7 @@ class DenoisingTask { int4 rect; int4 filter_area; - bool write_passes; + bool do_prefilter; bool do_filter; struct DeviceFunctions { diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 020b9e10e60..fd14bbdccc5 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -396,8 +396,8 @@ class MultiDevice : public Device { size_t existing_size = mem.device_size; /* This is a hack to only allocate the tile buffers on denoising devices - * Similarily the tile buffers also need to be allocated separately on all devices so any - * overlap rendered for denoising does not interfer with each other */ + * Similarly the tile buffers also need to be allocated separately on all devices so any + * overlap rendered for denoising does not interfere with each other */ if (strcmp(mem.name, "RenderBuffers") == 0) { vector<device_ptr> device_pointers; device_pointers.reserve(devices.size()); diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 0933d51f321..8904b517e92 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -313,6 +313,7 @@ void device_network_info(vector<DeviceInfo> &devices) info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; info.has_osl = false; + info.denoisers = DENOISER_NONE; devices.push_back(info); } diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp 
index 8a0b128697f..39b9ef70192 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -120,6 +120,7 @@ void device_opencl_info(vector<DeviceInfo> &devices) info.use_split_kernel = true; info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; + info.denoisers = DENOISER_NLM; info.id = id; /* Check OpenCL extensions */ diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index fbf6a914744..ececca3df53 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -246,7 +246,7 @@ class OptiXDevice : public CUDADevice { ~OptiXDevice() { // Stop processing any more tasks - task_pool.stop(); + task_pool.cancel(); // Make CUDA context current const CUDAContextScope scope(cuContext); @@ -428,11 +428,20 @@ class OptiXDevice : public CUDADevice { group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; if (requested_features.use_hair) { - // Add curve intersection programs group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve"; group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve"; + + // Add curve intersection programs + if (requested_features.use_hair_thick) { + // Slower programs for thick hair since that also slows down ribbons. + // Ideally this should not be needed. 
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; + } + else { + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + } } if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { @@ -712,7 +721,7 @@ class OptiXDevice : public CUDADevice { const CUDAContextScope scope(cuContext); // Choose between OptiX and NLM denoising - if (task.denoising_use_optix) { + if (task.denoising.type == DENOISER_OPTIX) { // Map neighboring tiles onto this device, indices are as following: // Where index 4 is the center tile and index 9 is the target for the result. // 0 1 2 @@ -1436,21 +1445,21 @@ class OptiXDevice : public CUDADevice { KernelData *const data = (KernelData *)host; *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - update_launch_params(name, offsetof(KernelParams, data), host, size); + update_launch_params(offsetof(KernelParams, data), host, size); return; } // Update data storage pointers in launch parameters # define KERNEL_TEX(data_type, tex_name) \ if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \ + update_launch_params(offsetof(KernelParams, tex_name), host, size); \ return; \ } # include "kernel/kernel_textures.h" # undef KERNEL_TEX } - void update_launch_params(const char *name, size_t offset, void *data, size_t data_size) + void update_launch_params(size_t offset, void *data, size_t data_size) { const CUDAContextScope scope(cuContext); @@ -1463,15 +1472,6 @@ class OptiXDevice : public CUDADevice { void task_add(DeviceTask &task) override { - struct OptiXDeviceTask : public DeviceTask { - OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task) - { - // Using task index parameter instead of thread 
index, since number of CUDA streams may - // differ from number of threads - run = function_bind(&OptiXDevice::thread_run, device, *this, task_index); - } - }; - // Upload texture information to device if it has changed since last launch load_texture_info(); @@ -1483,7 +1483,10 @@ class OptiXDevice : public CUDADevice { if (task.type == DeviceTask::DENOISE_BUFFER) { // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push(new OptiXDeviceTask(this, task, 0)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy, 0); + }); return; } @@ -1493,8 +1496,15 @@ class OptiXDevice : public CUDADevice { // Queue tasks in internal task pool int task_index = 0; - for (DeviceTask &task : tasks) - task_pool.push(new OptiXDeviceTask(this, task, task_index++)); + for (DeviceTask &task : tasks) { + task_pool.push([=] { + // Using task index parameter instead of thread index, since number of CUDA streams may + // differ from number of threads + DeviceTask task_copy = task; + thread_run(task_copy, task_index); + }); + task_index++; + } } void task_wait() override @@ -1551,6 +1561,7 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo info.type = DEVICE_OPTIX; info.id += "_OptiX"; + info.denoisers |= DENOISER_OPTIX; devices.push_back(info); } diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index f22d8761058..4c288f60c16 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -145,7 +145,7 @@ size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, return max_buffer_size / size_per_element; } -bool DeviceSplitKernel::path_trace(DeviceTask *task, +bool DeviceSplitKernel::path_trace(DeviceTask &task, RenderTile &tile, device_memory &kgbuffer, device_memory &kernel_data) @@ -222,9 +222,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, 
subtile.start_sample = tile.sample; subtile.num_samples = samples_per_second; - if (task->adaptive_sampling.use) { - subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample, - subtile.num_samples); + if (task.adaptive_sampling.use) { + subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample, + subtile.num_samples); } /* Don't go beyond requested number of samples. */ @@ -286,7 +286,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - if (task->get_cancel() && cancel_time == DBL_MAX) { + if (task.get_cancel() && cancel_time == DBL_MAX) { /* Wait up to twice as many seconds for current samples to finish * to avoid artifacts in render result from ending too soon. */ @@ -323,7 +323,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } int filter_sample = tile.sample + subtile.num_samples - 1; - if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) { + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { size_t buffer_size[2]; buffer_size[0] = round_up(tile.w, local_size[0]); buffer_size[1] = round_up(tile.h, local_size[1]); @@ -352,16 +352,16 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, #undef ENQUEUE_SPLIT_KERNEL tile.sample += subtile.num_samples; - task->update_progress(&tile, tile.w * tile.h * subtile.num_samples); + task.update_progress(&tile, tile.w * tile.h * subtile.num_samples); time_multiplier = min(time_multiplier << 1, 10); - if (task->get_cancel()) { + if (task.get_cancel()) { return true; } } - if (task->adaptive_sampling.use) { + if (task.adaptive_sampling.use) { /* Reset the start samples. 
*/ RenderTile subtile = tile; subtile.start_sample = tile.start_sample; diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 9d6b9efdd62..07a21b10299 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -109,7 +109,7 @@ class DeviceSplitKernel { virtual ~DeviceSplitKernel(); bool load_kernels(const DeviceRequestedFeatures &requested_features); - bool path_trace(DeviceTask *task, + bool path_trace(DeviceTask &task, RenderTile &rtile, device_memory &kgbuffer, device_memory &kernel_data); @@ -137,7 +137,7 @@ class DeviceSplitKernel { virtual int2 split_kernel_local_size() = 0; virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, - DeviceTask *task) = 0; + DeviceTask &task) = 0; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 7485e1b41de..6e7c184c6c9 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -50,7 +50,7 @@ DeviceTask::DeviceTask(Type type_) last_update_time = time_dt(); } -int DeviceTask::get_subtask_count(int num, int max_size) +int DeviceTask::get_subtask_count(int num, int max_size) const { if (max_size != 0) { int max_size_num; @@ -78,7 +78,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) return num; } -void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) +void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const { num = get_subtask_count(num, max_size); diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 8c4e682adb1..600973b8100 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -21,7 +21,6 @@ #include "util/util_function.h" #include "util/util_list.h" -#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -32,8 +31,33 @@ class RenderBuffers; class RenderTile; class Tile; +enum 
DenoiserType { + DENOISER_NLM = 1, + DENOISER_OPTIX = 2, + DENOISER_OPENIMAGEDENOISE = 4, + DENOISER_NUM, + + DENOISER_NONE = 0, + DENOISER_ALL = ~0, +}; + +typedef int DenoiserTypeMask; + class DenoiseParams { public: + /* Apply denoiser to image. */ + bool use; + /* Output denoising data passes (possibly without applying the denoiser). */ + bool store_passes; + + /* Denoiser type. */ + DenoiserType type; + + /* Viewport start sample. */ + int start_sample; + + /** Native Denoiser **/ + /* Pixel radius for neighboring pixels to take into account. */ int radius; /* Controls neighbor pixel weighting for the denoising filter. */ @@ -47,18 +71,36 @@ class DenoiseParams { int neighbor_frames; /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ bool clamp_input; + + /** Optix Denoiser **/ + /* Passes handed over to the OptiX denoiser (default to color + albedo). */ int optix_input_passes; DenoiseParams() { + use = false; + store_passes = false; + + type = DENOISER_NLM; + radius = 8; strength = 0.5f; feature_strength = 0.5f; relative_pca = false; neighbor_frames = 2; clamp_input = true; + optix_input_passes = 2; + + start_sample = 0; + } + + /* Test if a denoising task needs to run, also to prefilter passes for the native + * denoiser when we are not applying denoising to the combined image. 
*/ + bool need_denoising_task() const + { + return (use || (store_passes && type == DENOISER_NLM)); } }; @@ -75,7 +117,7 @@ class AdaptiveSampling { int min_samples; }; -class DeviceTask : public Task { +class DeviceTask { public: typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; Type type; @@ -98,8 +140,8 @@ class DeviceTask : public Task { explicit DeviceTask(Type type = RENDER); - int get_subtask_count(int num, int max_size = 0); - void split(list<DeviceTask> &tasks, int num, int max_size = 0); + int get_subtask_count(int num, int max_size = 0) const; + void split(list<DeviceTask> &tasks, int num, int max_size = 0) const; void update_progress(RenderTile *rtile, int pixel_samples = -1); @@ -116,10 +158,6 @@ class DeviceTask : public Task { bool denoising_from_render; vector<int> denoising_frames; - bool denoising_do_filter; - bool denoising_use_optix; - bool denoising_write_passes; - int pass_stride; int frame_stride; int target_pass_stride; diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h index 389268e1c2a..e0140996cf0 100644 --- a/intern/cycles/device/opencl/device_opencl.h +++ b/intern/cycles/device/opencl/device_opencl.h @@ -23,6 +23,7 @@ # include "util/util_map.h" # include "util/util_param.h" # include "util/util_string.h" +# include "util/util_task.h" # include "clew.h" @@ -258,6 +259,8 @@ class OpenCLDevice : public Device { TaskPool load_required_kernel_task_pool; /* Task pool for optional kernels (feature kernels during foreground rendering) */ TaskPool load_kernel_task_pool; + std::atomic<int> load_kernel_num_compiling; + cl_context cxContext; cl_command_queue cqCommandQueue; cl_platform_id cpPlatform; @@ -455,14 +458,6 @@ class OpenCLDevice : public Device { void denoise(RenderTile &tile, DenoisingTask &denoising); - class OpenCLDeviceTask : public DeviceTask { - public: - OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task) - { - run = 
function_bind(&OpenCLDevice::thread_run, device, this); - } - }; - int get_split_task_count(DeviceTask & /*task*/) { return 1; @@ -470,7 +465,10 @@ class OpenCLDevice : public Device { void task_add(DeviceTask &task) { - task_pool.push(new OpenCLDeviceTask(this, task)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); } void task_wait() @@ -483,7 +481,7 @@ class OpenCLDevice : public Device { task_pool.cancel(); } - void thread_run(DeviceTask *task); + void thread_run(DeviceTask &task); virtual BVHLayoutMask get_bvh_layout_mask() const { diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index beb3174b111..8c94815b193 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -542,7 +542,7 @@ class OpenCLSplitKernel : public DeviceSplitKernel { virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); /* Use small global size on CPU devices as it seems to be much faster. 
*/ @@ -610,6 +610,7 @@ void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) : Device(info, stats, profiler, background), + load_kernel_num_compiling(0), kernel_programs(this), preview_programs(this), memory_manager(this), @@ -684,9 +685,9 @@ OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, b OpenCLDevice::~OpenCLDevice() { - task_pool.stop(); - load_required_kernel_task_pool.stop(); - load_kernel_task_pool.stop(); + task_pool.cancel(); + load_required_kernel_task_pool.cancel(); + load_kernel_task_pool.cancel(); memory_manager.free(); @@ -798,7 +799,11 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_feature * internally within a single process. */ foreach (OpenCLProgram *program, programs) { if (!program->load()) { - load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + load_kernel_num_compiling++; + load_kernel_task_pool.push([=] { + program->compile(); + load_kernel_num_compiling--; + }); } } return true; @@ -868,7 +873,7 @@ bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requeste * Better to check on device level than per kernel as mixing preview and * non-preview kernels does not work due to different data types */ if (use_preview_kernels) { - use_preview_kernels = !load_kernel_task_pool.finished(); + use_preview_kernels = load_kernel_num_compiling.load() > 0; } } return split_kernel->load_kernels(requested_features); @@ -895,7 +900,7 @@ DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() return DEVICE_KERNEL_USING_FEATURE_KERNEL; } - bool other_kernels_finished = load_kernel_task_pool.finished(); + bool other_kernels_finished = load_kernel_num_compiling.load() == 0; if (use_preview_kernels) { if (other_kernels_finished) { return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; @@ -1336,20 +1341,20 @@ void OpenCLDevice::flush_texture_buffers() 
memory_manager.alloc("texture_info", texture_info); } -void OpenCLDevice::thread_run(DeviceTask *task) +void OpenCLDevice::thread_run(DeviceTask &task) { flush_texture_buffers(); - if (task->type == DeviceTask::RENDER) { + if (task.type == DeviceTask::RENDER) { RenderTile tile; - DenoisingTask denoising(this, *task); + DenoisingTask denoising(this, task); /* Allocate buffer for kernel globals */ device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); kgbuffer.alloc_to_device(1); /* Keep rendering tiles until done. */ - while (task->acquire_tile(this, tile, task->tile_types)) { + while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { assert(tile.task == RenderTile::PATH_TRACE); scoped_timer timer(&tile.buffers->render_time); @@ -1368,42 +1373,42 @@ void OpenCLDevice::thread_run(DeviceTask *task) clFinish(cqCommandQueue); } else if (tile.task == RenderTile::BAKE) { - bake(*task, tile); + bake(task, tile); } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } - task->release_tile(tile); + task.release_tile(tile); } kgbuffer.free(); } - else if (task->type == DeviceTask::SHADER) { - shader(*task); + else if (task.type == DeviceTask::SHADER) { + shader(task); } - else if (task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + else if (task.type == DeviceTask::FILM_CONVERT) { + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); } - else if (task->type == DeviceTask::DENOISE_BUFFER) { + else if (task.type == DeviceTask::DENOISE_BUFFER) { RenderTile tile; - tile.x = task->x; - tile.y = task->y; - tile.w = task->w; - tile.h = task->h; - tile.buffer = task->buffer; - tile.sample = task->sample + task->num_samples; - tile.num_samples = task->num_samples; - tile.start_sample = 
task->sample; - tile.offset = task->offset; - tile.stride = task->stride; - tile.buffers = task->buffers; - - DenoisingTask denoising(this, *task); + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + DenoisingTask denoising(this, task); denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } } @@ -1937,10 +1942,8 @@ void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) clFinish(cqCommandQueue); } -string OpenCLDevice::kernel_build_options(const string *debug_src) +static bool kernel_build_opencl_2(cl_device_id cdDevice) { - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - /* Build with OpenCL 2.0 if available, this improves performance * with AMD OpenCL drivers on Windows and Linux (legacy drivers). * Note that OpenCL selects the highest 1.x version by default, @@ -1948,10 +1951,36 @@ string OpenCLDevice::kernel_build_options(const string *debug_src) int version_major, version_minor; if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) { if (version_major >= 2) { - build_options += "-cl-std=CL2.0 "; + /* This appears to trigger a driver bug in Radeon RX cards with certain + * driver version, so don't use OpenCL 2.0 for those. 
*/ + string device_name = OpenCLInfo::get_readable_device_name(cdDevice); + if (string_startswith(device_name, "Radeon RX 4") || + string_startswith(device_name, "Radeon (TM) RX 4") || + string_startswith(device_name, "Radeon RX 5") || + string_startswith(device_name, "Radeon (TM) RX 5")) { + char version[256] = ""; + int driver_major, driver_minor; + clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) { + return !(driver_major == 3075 && driver_minor <= 12); + } + } + + return true; } } + return false; +} + +string OpenCLDevice::kernel_build_options(const string *debug_src) +{ + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; + + if (kernel_build_opencl_2(cdDevice)) { + build_options += "-cl-std=CL2.0 "; + } + if (platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 2e839a616e9..7cc0d32d521 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -81,18 +81,6 @@ set(SRC_BVH_HEADERS bvh/bvh_types.h bvh/bvh_volume.h bvh/bvh_volume_all.h - bvh/qbvh_nodes.h - bvh/qbvh_shadow_all.h - bvh/qbvh_local.h - bvh/qbvh_traversal.h - bvh/qbvh_volume.h - bvh/qbvh_volume_all.h - bvh/obvh_nodes.h - bvh/obvh_shadow_all.h - bvh/obvh_local.h - bvh/obvh_traversal.h - bvh/obvh_volume.h - bvh/obvh_volume_all.h bvh/bvh_embree.h ) @@ -113,6 +101,8 @@ set(SRC_HEADERS kernel_id_passes.h kernel_jitter.h kernel_light.h + kernel_light_background.h + kernel_light_common.h kernel_math.h kernel_montecarlo.h kernel_passes.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 9b9df883b62..80b58f46329 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -35,14 +35,6 @@ CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPTIX__ -/* Common QBVH functions. 
*/ -# ifdef __QBVH__ -# include "kernel/bvh/qbvh_nodes.h" -# ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_nodes.h" -# endif -# endif - /* Regular BVH traversal */ # include "kernel/bvh/bvh_nodes.h" @@ -51,27 +43,21 @@ CCL_NAMESPACE_BEGIN # define BVH_FUNCTION_FEATURES 0 # include "kernel/bvh/bvh_traversal.h" -# if defined(__INSTANCING__) -# define BVH_FUNCTION_NAME bvh_intersect_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "kernel/bvh/bvh_traversal.h" -# endif - # if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_HAIR # include "kernel/bvh/bvh_traversal.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_MOTION # include "kernel/bvh/bvh_traversal.h" # endif # if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION # include "kernel/bvh/bvh_traversal.h" # endif @@ -96,15 +82,9 @@ CCL_NAMESPACE_BEGIN # define BVH_FUNCTION_FEATURES BVH_HAIR # include "kernel/bvh/bvh_volume.h" -# if defined(__INSTANCING__) -# define BVH_FUNCTION_NAME bvh_intersect_volume_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR -# include "kernel/bvh/bvh_volume.h" -# endif - # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR # include "kernel/bvh/bvh_volume.h" # endif # endif /* __VOLUME__ */ @@ -116,27 +96,21 @@ CCL_NAMESPACE_BEGIN # define BVH_FUNCTION_FEATURES 0 # include "kernel/bvh/bvh_shadow_all.h" -# if defined(__INSTANCING__) -# define BVH_FUNCTION_NAME 
bvh_intersect_shadow_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "kernel/bvh/bvh_shadow_all.h" -# endif - # if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_HAIR # include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_MOTION # include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR | BVH_MOTION +# define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION # include "kernel/bvh/bvh_shadow_all.h" # endif # endif /* __SHADOW_RECORD_ALL__ */ @@ -148,15 +122,9 @@ CCL_NAMESPACE_BEGIN # define BVH_FUNCTION_FEATURES BVH_HAIR # include "kernel/bvh/bvh_volume_all.h" -# if defined(__INSTANCING__) -# define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_HAIR -# include "kernel/bvh/bvh_volume_all.h" -# endif - # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion -# define BVH_FUNCTION_FEATURES BVH_INSTANCING | BVH_MOTION | BVH_HAIR +# define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR # include "kernel/bvh/bvh_volume_all.h" # endif # endif /* __VOLUME_RECORD_ALL__ */ @@ -264,21 +232,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, } # endif /* __HAIR__ */ -# ifdef __KERNEL_CPU__ -# ifdef __INSTANCING__ - if (kernel_data.bvh.have_instancing) { - return bvh_intersect_instancing(kg, ray, isect, visibility); - } -# endif /* __INSTANCING__ */ - return bvh_intersect(kg, ray, isect, visibility); -# else /* __KERNEL_CPU__ */ -# ifdef __INSTANCING__ - return bvh_intersect_instancing(kg, ray, 
isect, visibility); -# else return bvh_intersect(kg, ray, isect, visibility); -# endif /* __INSTANCING__ */ -# endif /* __KERNEL_CPU__ */ -#endif /* __KERNEL_OPTIX__ */ +#endif /* __KERNEL_OPTIX__ */ } #ifdef __BVH_LOCAL__ @@ -476,21 +431,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, } # endif /* __HAIR__ */ -# ifdef __KERNEL_CPU__ -# ifdef __INSTANCING__ - if (kernel_data.bvh.have_instancing) { - return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits); - } -# endif /* __INSTANCING__ */ return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits); -# else -# ifdef __INSTANCING__ - return bvh_intersect_shadow_all_instancing(kg, ray, isect, visibility, max_hits, num_hits); -# else - return bvh_intersect_shadow_all(kg, ray, isect, visibility, max_hits, num_hits); -# endif /* __INSTANCING__ */ -# endif /* __KERNEL_CPU__ */ -# endif /* __KERNEL_OPTIX__ */ +# endif /* __KERNEL_OPTIX__ */ } #endif /* __SHADOW_RECORD_ALL__ */ @@ -548,21 +490,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, } # endif /* __OBJECT_MOTION__ */ -# ifdef __KERNEL_CPU__ -# ifdef __INSTANCING__ - if (kernel_data.bvh.have_instancing) { - return bvh_intersect_volume_instancing(kg, ray, isect, visibility); - } -# endif /* __INSTANCING__ */ return bvh_intersect_volume(kg, ray, isect, visibility); -# else /* __KERNEL_CPU__ */ -# ifdef __INSTANCING__ - return bvh_intersect_volume_instancing(kg, ray, isect, visibility); -# else - return bvh_intersect_volume(kg, ray, isect, visibility); -# endif /* __INSTANCING__ */ -# endif /* __KERNEL_CPU__ */ -# endif /* __KERNEL_OPTIX__ */ +# endif /* __KERNEL_OPTIX__ */ } #endif /* __VOLUME__ */ @@ -599,11 +528,6 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, } # endif /* __OBJECT_MOTION__ */ -# ifdef __INSTANCING__ - if (kernel_data.bvh.have_instancing) { - return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, 
visibility); - } -# endif /* __INSTANCING__ */ return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility); } #endif /* __VOLUME_RECORD_ALL__ */ diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 7a069ef1108..4006c9c1632 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -17,13 +17,6 @@ * limitations under the License. */ -#ifdef __QBVH__ -# include "kernel/bvh/qbvh_local.h" -# ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_local.h" -# endif -#endif - #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect #else @@ -88,26 +81,6 @@ ccl_device_inline object = local_object; } -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; -# if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect_t); -# endif - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - /* traversal loop */ do { do { @@ -117,33 +90,16 @@ ccl_device_inline float dist[2]; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -#if !defined(__KERNEL_SSE2__) traverse_mask = NODE_INTERSECT(kg, P, -# if BVH_FEATURE(BVH_HAIR) +#if BVH_FEATURE(BVH_HAIR) dir, -# endif +#endif idir, isect_t, node_addr, PATH_RAY_ALL_VISIBILITY, dist); -#else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, -# if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, -# endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - PATH_RAY_ALL_VISIBILITY, - dist); -#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); @@ -247,20 +203,7 @@ ccl_device_inline bool 
BVH_FUNCTION_NAME(KernelGlobals *kg, uint *lcg_state, int max_hits) { - switch (kernel_data.bvh.bvh_layout) { -#ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); -#endif -#ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); -#endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); - } - kernel_assert(!"Should not happen"); - return false; + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, local_isect, local_object, lcg_state, max_hits); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index db598d1c7fa..5367bdb633c 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -28,7 +28,6 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k return space; } -#if !defined(__KERNEL_SSE2__) ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, const float3 P, const float3 idir, @@ -39,9 +38,9 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, { /* fetch node data */ -# ifdef __VISIBILITY_FLAG__ +#ifdef __VISIBILITY_FLAG__ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -# endif +#endif float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1); float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2); float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3); @@ -68,13 +67,13 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, dist[0] = c0min; dist[1] = c1min; -# ifdef __VISIBILITY_FLAG__ +#ifdef __VISIBILITY_FLAG__ /* this visibility test gives a 5% performance hit, how to solve? */ return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 
1 : 0) | (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); -# else +#else return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0); -# endif +#endif } ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg, @@ -113,21 +112,21 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, float dist[2]) { int mask = 0; -# ifdef __VISIBILITY_FLAG__ +#ifdef __VISIBILITY_FLAG__ float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -# endif +#endif if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { -# ifdef __VISIBILITY_FLAG__ +#ifdef __VISIBILITY_FLAG__ if ((__float_as_uint(cnodes.x) & visibility)) -# endif +#endif { mask |= 1; } } if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { -# ifdef __VISIBILITY_FLAG__ +#ifdef __VISIBILITY_FLAG__ if ((__float_as_uint(cnodes.y) & visibility)) -# endif +#endif { mask |= 2; } @@ -152,125 +151,3 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist); } } - -#else /* !defined(__KERNEL_SSE2__) */ - -int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg, - const float3 &P, - const float3 &dir, - const ssef &tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) -{ - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - - /* fetch node data */ - const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], 
shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - dist[0] = tminmax[0]; - dist[1] = tminmax[1]; - - int mask = movemask(lrhit); - -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); - return cmask; -# else - return mask & 3; -# endif -} - -ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef &isect_near, - const ssef &isect_far, - const int node_addr, - const uint visibility, - float dist[2]) -{ - Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); - Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); - - float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir); - float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); - float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), - nrdir1 = -bvh_inverse_direction(aligned_dir1); - - ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f), - lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f), - lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f); - - ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), - upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), - upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); - - ssef tnear_x = min(lower_x, upper_x); - ssef tnear_y = min(lower_y, upper_y); - ssef tnear_z = min(lower_z, upper_z); - ssef tfar_x = max(lower_x, 
upper_x); - ssef tfar_y = max(lower_y, upper_y); - ssef tfar_z = max(lower_z, upper_z); - - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - sseb vmask = tnear <= tfar; - dist[0] = tnear.f[0]; - dist[1] = tnear.f[1]; - - int mask = (int)movemask(vmask); - -# ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); - return cmask; -# else - return mask & 3; -# endif -} - -ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3 &P, - const float3 &dir, - const ssef &isect_near, - const ssef &isect_far, - const ssef &tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) -{ - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect( - kg, P, dir, isect_near, isect_far, node_addr, visibility, dist); - } - else { - return bvh_aligned_node_intersect( - kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist); - } -} -#endif /* !defined(__KERNEL_SSE2__) */ diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 268bb149970..dccd257d2de 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -17,13 +17,6 @@ * limitations under the License. */ -#ifdef __QBVH__ -# include "kernel/bvh/qbvh_shadow_all.h" -# ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_shadow_all.h" -# endif -#endif - #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect #else @@ -34,7 +27,6 @@ * enabled/disabled. 
This way we can compile optimized versions for each case * without new features slowing things down. * - * BVH_INSTANCING: object instancing * BVH_HAIR: hair curve rendering * BVH_MOTION: motion blur rendering */ @@ -76,33 +68,11 @@ ccl_device_inline Transform ob_itfm; #endif -#if BVH_FEATURE(BVH_INSTANCING) int num_hits_in_instance = 0; -#endif *num_hits = 0; isect_array->t = tmax; -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; -# if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect_t); -# endif - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif /* __KERNEL_SSE2__ */ - /* traversal loop */ do { do { @@ -112,33 +82,16 @@ ccl_device_inline float dist[2]; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -#if !defined(__KERNEL_SSE2__) traverse_mask = NODE_INTERSECT(kg, P, -# if BVH_FEATURE(BVH_HAIR) +#if BVH_FEATURE(BVH_HAIR) dir, -# endif +#endif idir, isect_t, node_addr, visibility, dist); -#else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, -# if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, -# endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); -#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); @@ -174,9 +127,7 @@ ccl_device_inline float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); int prim_addr = __float_as_int(leaf.x); -#if BVH_FEATURE(BVH_INSTANCING) if (prim_addr >= 0) { -#endif const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); const uint p_type = type & PRIMITIVE_ALL; @@ -207,31 +158,13 @@ 
ccl_device_inline } #endif #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { + case PRIMITIVE_CURVE_THICK: + case PRIMITIVE_MOTION_CURVE_THICK: + case PRIMITIVE_CURVE_RIBBON: + case PRIMITIVE_MOTION_CURVE_RIBBON: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type); - } - else { - hit = curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type); - } + hit = curve_intersect( + kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type); break; } #endif @@ -276,9 +209,7 @@ ccl_device_inline /* move on to next entry in intersections array */ isect_array++; (*num_hits)++; -#if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; -#endif isect_array->t = isect_t; } @@ -286,32 +217,19 @@ ccl_device_inline prim_addr++; } } -#if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ object = kernel_tex_fetch(__prim_object, -prim_addr - 1); -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -# else +#else isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); -# endif +#endif num_hits_in_instance = 0; isect_array->t = isect_t; -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); -# endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif - ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; @@ -319,10 +237,8 @@ ccl_device_inline node_addr = kernel_tex_fetch(__object_node, object); } } -#endif /* 
FEATURE(BVH_INSTANCING) */ } while (node_addr != ENTRYPOINT_SENTINEL); -#if BVH_FEATURE(BVH_INSTANCING) if (stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); @@ -330,11 +246,11 @@ ccl_device_inline if (num_hits_in_instance) { float t_fac; -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); -# else +#else bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); -# endif +#endif /* scale isect->t to adjust for instancing */ for (int i = 0; i < num_hits_in_instance; i++) { @@ -342,33 +258,20 @@ ccl_device_inline } } else { -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -# else +#else bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -# endif +#endif } isect_t = tmax; isect_array->t = isect_t; -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); -# endif - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr]; --stack_ptr; } -#endif /* FEATURE(BVH_INSTANCING) */ } while (node_addr != ENTRYPOINT_SENTINEL); return false; @@ -381,20 +284,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const uint max_hits, uint *num_hits) { - switch (kernel_data.bvh.bvh_layout) { -#ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, visibility, max_hits, num_hits); -#endif -#ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, visibility, max_hits, num_hits); -#endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits); - } - kernel_assert(!"Should not happen"); 
- return false; + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, visibility, max_hits, num_hits); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 18afc6ae4eb..8b2699ab807 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -17,13 +17,6 @@ * limitations under the License. */ -#ifdef __QBVH__ -# include "kernel/bvh/qbvh_traversal.h" -#endif -#ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_traversal.h" -#endif - #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect #else @@ -34,7 +27,6 @@ * enabled/disabled. This way we can compile optimized versions for each case * without new features slowing things down. * - * BVH_INSTANCING: object instancing * BVH_HAIR: hair curve rendering * BVH_MOTION: motion blur rendering */ @@ -77,26 +69,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, BVH_DEBUG_INIT(); -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; -# if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect->t); -# endif - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - /* traversal loop */ do { do { @@ -106,37 +78,18 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, float dist[2]; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -#if !defined(__KERNEL_SSE2__) { traverse_mask = NODE_INTERSECT(kg, P, -# if BVH_FEATURE(BVH_HAIR) +#if BVH_FEATURE(BVH_HAIR) dir, -# endif +#endif idir, isect->t, node_addr, visibility, dist); } -#else // __KERNEL_SSE2__ - { - 
traverse_mask = NODE_INTERSECT(kg, - P, - dir, -# if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, -# endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); - } -#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); @@ -173,9 +126,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); int prim_addr = __float_as_int(leaf.x); -#if BVH_FEATURE(BVH_INSTANCING) if (prim_addr >= 0) { -#endif const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); @@ -191,17 +142,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { /* shadow ray early termination */ -#if defined(__KERNEL_SSE2__) if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif -#else - if (visibility & PATH_RAY_SHADOW_OPAQUE) - return true; -#endif } } break; @@ -214,51 +156,28 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if (motion_triangle_intersect( kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { /* shadow ray early termination */ -# if defined(__KERNEL_SSE2__) - if (visibility & PATH_RAY_SHADOW_OPAQUE) - return true; - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif -# else if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; -# endif } } break; } #endif /* BVH_FEATURE(BVH_MOTION) */ #if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { + case PRIMITIVE_CURVE_THICK: + case PRIMITIVE_MOTION_CURVE_THICK: + case PRIMITIVE_CURVE_RIBBON: + case PRIMITIVE_MOTION_CURVE_RIBBON: { for (; prim_addr < prim_addr2; 
prim_addr++) { BVH_DEBUG_NEXT_INTERSECTION(); const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - bool hit; - if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); - } - else { - hit = curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); - } + const bool hit = curve_intersect( + kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); if (hit) { /* shadow ray early termination */ -# if defined(__KERNEL_SSE2__) if (visibility & PATH_RAY_SHADOW_OPAQUE) return true; - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif -# else - if (visibility & PATH_RAY_SHADOW_OPAQUE) - return true; -# endif } } break; @@ -266,30 +185,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, #endif /* BVH_FEATURE(BVH_HAIR) */ } } -#if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ object = kernel_tex_fetch(__prim_object, -prim_addr - 1); -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) isect->t = bvh_instance_motion_push( kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else +#else isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif +#endif ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); @@ -300,38 +205,22 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INSTANCE(); } } -#endif /* FEATURE(BVH_INSTANCING) */ } while (node_addr != 
ENTRYPOINT_SENTINEL); -#if BVH_FEATURE(BVH_INSTANCING) if (stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else +#else isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif +#endif object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr]; --stack_ptr; } -#endif /* FEATURE(BVH_INSTANCING) */ } while (node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); @@ -342,20 +231,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, Intersection *isect, const uint visibility) { - switch (kernel_data.bvh.bvh_layout) { -#ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect, visibility); -#endif -#ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility); -#endif /* __QBVH__ */ - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility); - } - kernel_assert(!"Should not happen"); - return false; + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h index 84dc0dbaef5..b173568266b 100644 --- a/intern/cycles/kernel/bvh/bvh_types.h +++ b/intern/cycles/kernel/bvh/bvh_types.h @@ -31,13 +31,10 @@ CCL_NAMESPACE_BEGIN /* 64 object BVH + 64 mesh BVH + 64 object node splitting */ #define BVH_STACK_SIZE 192 -#define BVH_QSTACK_SIZE 384 -#define BVH_OSTACK_SIZE 768 /* BVH intersection function variations 
*/ -#define BVH_INSTANCING 1 -#define BVH_MOTION 2 -#define BVH_HAIR 4 +#define BVH_MOTION 1 +#define BVH_HAIR 2 #define BVH_NAME_JOIN(x, y) x##_##y #define BVH_NAME_EVAL(x, y) BVH_NAME_JOIN(x, y) diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index c83b0d783f4..1f2ea47269b 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -17,13 +17,6 @@ * limitations under the License. */ -#ifdef __QBVH__ -# include "kernel/bvh/qbvh_volume.h" -# ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_volume.h" -# endif -#endif - #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect #else @@ -34,7 +27,6 @@ * various features can be enabled/disabled. This way we can compile optimized * versions for each case without new features slowing things down. * - * BVH_INSTANCING: object instancing * BVH_MOTION: motion blur rendering */ @@ -79,26 +71,6 @@ ccl_device_inline isect->prim = PRIM_NONE; isect->object = OBJECT_NONE; -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; -# if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect->t); -# endif - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif - /* traversal loop */ do { do { @@ -108,33 +80,16 @@ ccl_device_inline float dist[2]; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -#if !defined(__KERNEL_SSE2__) traverse_mask = NODE_INTERSECT(kg, P, -# if BVH_FEATURE(BVH_HAIR) +#if BVH_FEATURE(BVH_HAIR) dir, -# endif +#endif idir, isect->t, node_addr, visibility, dist); -#else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, 
-# if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, -# endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); -#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); @@ -170,9 +125,7 @@ ccl_device_inline float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); int prim_addr = __float_as_int(leaf.x); -#if BVH_FEATURE(BVH_INSTANCING) if (prim_addr >= 0) { -#endif const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); @@ -222,31 +175,17 @@ ccl_device_inline } } } -#if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ object = kernel_tex_fetch(__prim_object, -prim_addr - 1); int object_flag = kernel_tex_fetch(__object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) isect->t = bvh_instance_motion_push( kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else +#else isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif +#endif ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); @@ -262,38 +201,22 @@ ccl_device_inline } } } -#endif /* FEATURE(BVH_INSTANCING) */ } while (node_addr != ENTRYPOINT_SENTINEL); -#if BVH_FEATURE(BVH_INSTANCING) if (stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* instance pop */ -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else +#else isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = 
ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect->t); -# endif - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif +#endif object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr]; --stack_ptr; } -#endif /* FEATURE(BVH_MOTION) */ } while (node_addr != ENTRYPOINT_SENTINEL); return (isect->prim != PRIM_NONE); @@ -304,20 +227,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, Intersection *isect, const uint visibility) { - switch (kernel_data.bvh.bvh_layout) { -#ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect, visibility); -#endif -#ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect, visibility); -#endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility); - } - kernel_assert(!"Should not happen"); - return false; + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect, visibility); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index ae8c4d12e8a..a8664cc4331 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -17,13 +17,6 @@ * limitations under the License. */ -#ifdef __QBVH__ -# include "kernel/bvh/qbvh_volume_all.h" -# ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_volume_all.h" -# endif -#endif - #if BVH_FEATURE(BVH_HAIR) # define NODE_INTERSECT bvh_node_intersect #else @@ -34,7 +27,6 @@ * various features can be enabled/disabled. This way we can compile optimized * versions for each case without new features slowing things down. 
* - * BVH_INSTANCING: object instancing * BVH_MOTION: motion blur rendering */ @@ -76,33 +68,11 @@ ccl_device_inline Transform ob_itfm; #endif -#if BVH_FEATURE(BVH_INSTANCING) int num_hits_in_instance = 0; -#endif uint num_hits = 0; isect_array->t = tmax; -#if defined(__KERNEL_SSE2__) - const shuffle_swap_t shuf_identity = shuffle_swap_identity(); - const shuffle_swap_t shuf_swap = shuffle_swap_swap(); - - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - ssef Psplat[3], idirsplat[3]; -# if BVH_FEATURE(BVH_HAIR) - ssef tnear(0.0f), tfar(isect_t); -# endif - shuffle_swap_t shufflexyz[3]; - - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t); - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -#endif /* __KERNEL_SSE2__ */ - /* traversal loop */ do { do { @@ -112,33 +82,16 @@ ccl_device_inline float dist[2]; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); -#if !defined(__KERNEL_SSE2__) traverse_mask = NODE_INTERSECT(kg, P, -# if BVH_FEATURE(BVH_HAIR) +#if BVH_FEATURE(BVH_HAIR) dir, -# endif +#endif idir, isect_t, node_addr, visibility, dist); -#else // __KERNEL_SSE2__ - traverse_mask = NODE_INTERSECT(kg, - P, - dir, -# if BVH_FEATURE(BVH_HAIR) - tnear, - tfar, -# endif - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); -#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); @@ -174,9 +127,7 @@ ccl_device_inline float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); int prim_addr = __float_as_int(leaf.x); -#if BVH_FEATURE(BVH_INSTANCING) if (prim_addr >= 0) { -#endif const int prim_addr2 = __float_as_int(leaf.y); const uint type = __float_as_int(leaf.w); bool hit; @@ -204,25 +155,21 @@ ccl_device_inline /* Move on to next entry in intersections array. 
*/ isect_array++; num_hits++; -#if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; -#endif isect_array->t = isect_t; if (num_hits == max_hits) { -#if BVH_FEATURE(BVH_INSTANCING) if (object != OBJECT_NONE) { -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); -# else +#else Transform itfm = object_fetch_transform( kg, object, OBJECT_INVERSE_TRANSFORM); float t_fac = 1.0f / len(transform_direction(&itfm, dir)); -# endif +#endif for (int i = 0; i < num_hits_in_instance; i++) { (isect_array - i - 1)->t *= t_fac; } } -#endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } } @@ -248,25 +195,21 @@ ccl_device_inline /* Move on to next entry in intersections array. */ isect_array++; num_hits++; -# if BVH_FEATURE(BVH_INSTANCING) num_hits_in_instance++; -# endif isect_array->t = isect_t; if (num_hits == max_hits) { -# if BVH_FEATURE(BVH_INSTANCING) if (object != OBJECT_NONE) { -# if BVH_FEATURE(BVH_MOTION) +# if BVH_FEATURE(BVH_MOTION) float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); -# else +# else Transform itfm = object_fetch_transform( kg, object, OBJECT_INVERSE_TRANSFORM); float t_fac = 1.0f / len(transform_direction(&itfm, dir)); -# endif +# endif for (int i = 0; i < num_hits_in_instance; i++) { (isect_array - i - 1)->t *= t_fac; } } -# endif /* BVH_FEATURE(BVH_INSTANCING) */ return num_hits; } } @@ -279,35 +222,21 @@ ccl_device_inline } } } -#if BVH_FEATURE(BVH_INSTANCING) else { /* instance push */ object = kernel_tex_fetch(__prim_object, -prim_addr - 1); int object_flag = kernel_tex_fetch(__object_flag, object); if (object_flag & SD_OBJECT_HAS_VOLUME) { -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) isect_t = bvh_instance_motion_push( kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -# else +#else isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); -# endif +#endif num_hits_in_instance = 0; isect_array->t = isect_t; -# if 
defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); -# endif - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif - ++stack_ptr; kernel_assert(stack_ptr < BVH_STACK_SIZE); traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL; @@ -322,55 +251,39 @@ ccl_device_inline } } } -#endif /* FEATURE(BVH_INSTANCING) */ } while (node_addr != ENTRYPOINT_SENTINEL); -#if BVH_FEATURE(BVH_INSTANCING) if (stack_ptr >= 0) { kernel_assert(object != OBJECT_NONE); /* Instance pop. */ if (num_hits_in_instance) { float t_fac; -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); -# else +#else bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); -# endif +#endif /* Scale isect->t to adjust for instancing. */ for (int i = 0; i < num_hits_in_instance; i++) { (isect_array - i - 1)->t *= t_fac; } } else { -# if BVH_FEATURE(BVH_MOTION) +#if BVH_FEATURE(BVH_MOTION) bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -# else +#else bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -# endif +#endif } isect_t = tmax; isect_array->t = isect_t; -# if defined(__KERNEL_SSE2__) - Psplat[0] = ssef(P.x); - Psplat[1] = ssef(P.y); - Psplat[2] = ssef(P.z); - - tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t); -# if BVH_FEATURE(BVH_HAIR) - tfar = ssef(isect_t); -# endif - - gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); -# endif - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr]; --stack_ptr; } -#endif /* FEATURE(BVH_INSTANCING) */ } while (node_addr != ENTRYPOINT_SENTINEL); return num_hits; @@ -382,20 +295,7 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, const uint max_hits, const uint visibility) { - switch 
(kernel_data.bvh.bvh_layout) { -#ifdef __KERNEL_AVX2__ - case BVH_LAYOUT_BVH8: - return BVH_FUNCTION_FULL_NAME(OBVH)(kg, ray, isect_array, max_hits, visibility); -#endif -#ifdef __QBVH__ - case BVH_LAYOUT_BVH4: - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, max_hits, visibility); -#endif - case BVH_LAYOUT_BVH2: - return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility); - } - kernel_assert(!"Should not happen"); - return 0; + return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, max_hits, visibility); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h deleted file mode 100644 index e6bb548bc5b..00000000000 --- a/intern/cycles/kernel/bvh/obvh_local.h +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function for subsurface scattering, where - * various features can be enabled/disabled. This way we can compile optimized - * versions for each case without new features slowing things down. 
- * - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT obvh_node_intersect -#else -# define NODE_INTERSECT obvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, - const Ray *ray, - LocalIntersection *local_isect, - int local_object, - uint *lcg_state, - int max_hits) -{ - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, local_object); - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = ray->t; - - if (local_isect != NULL) { - local_isect->num_hits = 0; - } - kernel_assert((local_isect == NULL) == (max_hits == 0)); - - const int object_flag = kernel_tex_fetch(__object_flag, local_object); - if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -#else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); -#endif - object = local_object; - } - - avxf tnear(0.0f), tfar(isect_t); -#if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -#endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. 
*/ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - avxf cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); - } - else -#endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child - 
*/ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - int prim_addr = __float_as_int(leaf.x); - - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. 
*/ - switch (type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* Intersect ray against primitive, */ - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect_local(kg, - local_isect, - P, - dir, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* Intersect ray against primitive. */ - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (motion_triangle_intersect_local(kg, - local_isect, - P, - dir, - ray->time, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } -#endif - default: - break; - } - } - } while (node_addr != ENTRYPOINT_SENTINEL); - } while (node_addr != ENTRYPOINT_SENTINEL); - return false; -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h deleted file mode 100644 index e5c935b75ed..00000000000 --- a/intern/cycles/kernel/bvh/obvh_nodes.h +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * Aligned nodes intersection AVX code is adopted from Embree, - */ - -struct OBVHStackItem { - int addr; - float dist; -}; - -ccl_device_inline void obvh_near_far_idx_calc(const float3 &idir, - int *ccl_restrict near_x, - int *ccl_restrict near_y, - int *ccl_restrict near_z, - int *ccl_restrict far_x, - int *ccl_restrict far_y, - int *ccl_restrict far_z) - -{ -#ifdef __KERNEL_SSE__ - *near_x = 0; - *far_x = 1; - *near_y = 2; - *far_y = 3; - *near_z = 4; - *far_z = 5; - - const size_t mask = movemask(ssef(idir.m128)); - - const int mask_x = mask & 1; - const int mask_y = (mask & 2) >> 1; - const int mask_z = (mask & 4) >> 2; - - *near_x += mask_x; - *far_x -= mask_x; - *near_y += mask_y; - *far_y -= mask_y; - *near_z += mask_z; - *far_z -= mask_z; -#else - if (idir.x >= 0.0f) { - *near_x = 0; - *far_x = 1; - } - else { - *near_x = 1; - *far_x = 0; - } - if (idir.y >= 0.0f) { - *near_y = 2; - *far_y = 3; - } - else { - *near_y = 3; - *far_y = 2; - } - if (idir.z >= 0.0f) { - *near_z = 4; - *far_z = 5; - } - else { - *near_z = 5; - *far_z = 4; - } -#endif -} - -ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, OBVHStackItem *ccl_restrict b) -{ - OBVHStackItem tmp = *a; - *a = *b; - *b = tmp; -} - -ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, - OBVHStackItem *ccl_restrict s2, - OBVHStackItem *ccl_restrict s3) -{ - if (s2->dist < s1->dist) { - obvh_item_swap(s2, s1); - } - if (s3->dist < s2->dist) { - obvh_item_swap(s3, s2); - } - if (s2->dist < s1->dist) { - obvh_item_swap(s2, s1); - } -} - -ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, - OBVHStackItem *ccl_restrict s2, - OBVHStackItem *ccl_restrict s3, - OBVHStackItem *ccl_restrict s4) -{ - if (s2->dist < s1->dist) { - obvh_item_swap(s2, s1); - } - if (s4->dist < s3->dist) { - obvh_item_swap(s4, s3); - } - if (s3->dist < s1->dist) { - obvh_item_swap(s3, s1); - } - if (s4->dist < s2->dist) { - obvh_item_swap(s4, s2); - } - if (s3->dist < 
s2->dist) { - obvh_item_swap(s3, s2); - } -} - -ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, - OBVHStackItem *ccl_restrict s2, - OBVHStackItem *ccl_restrict s3, - OBVHStackItem *ccl_restrict s4, - OBVHStackItem *ccl_restrict s5) -{ - obvh_stack_sort(s1, s2, s3, s4); - if (s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if (s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if (s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if (s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } -} - -ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, - OBVHStackItem *ccl_restrict s2, - OBVHStackItem *ccl_restrict s3, - OBVHStackItem *ccl_restrict s4, - OBVHStackItem *ccl_restrict s5, - OBVHStackItem *ccl_restrict s6) -{ - obvh_stack_sort(s1, s2, s3, s4, s5); - if (s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if (s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if (s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if (s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if (s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } -} - -ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, - OBVHStackItem *ccl_restrict s2, - OBVHStackItem *ccl_restrict s3, - OBVHStackItem *ccl_restrict s4, - OBVHStackItem *ccl_restrict s5, - OBVHStackItem *ccl_restrict s6, - OBVHStackItem *ccl_restrict s7) -{ - obvh_stack_sort(s1, s2, s3, s4, s5, s6); - if (s7->dist < s6->dist) { - obvh_item_swap(s6, s7); - if (s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if (s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if (s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if (s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if (s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } - } -} - -ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, - OBVHStackItem *ccl_restrict s2, - OBVHStackItem *ccl_restrict s3, - OBVHStackItem *ccl_restrict s4, - OBVHStackItem 
*ccl_restrict s5, - OBVHStackItem *ccl_restrict s6, - OBVHStackItem *ccl_restrict s7, - OBVHStackItem *ccl_restrict s8) -{ - obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); - if (s8->dist < s7->dist) { - obvh_item_swap(s7, s8); - if (s7->dist < s6->dist) { - obvh_item_swap(s6, s7); - if (s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if (s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if (s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if (s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if (s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } - } - } -} - -/* Axis-aligned nodes intersection */ - -ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, - const avxf &isect_near, - const avxf &isect_far, -#ifdef __KERNEL_AVX2__ - const avx3f &org_idir, -#else - const avx3f &org, -#endif - const avx3f &idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) -{ - const int offset = node_addr + 2; -#ifdef __KERNEL_AVX2__ - const avxf tnear_x = msub( - kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, org_idir.x); - const avxf tnear_y = msub( - kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, org_idir.y); - const avxf tnear_z = msub( - kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, org_idir.z); - const avxf tfar_x = msub( - kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, org_idir.x); - const avxf tfar_y = msub( - kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, org_idir.y); - const avxf tfar_z = msub( - kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, org_idir.z); - - const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); - const avxb vmask = tnear <= tfar; - int mask = (int)movemask(vmask); - *dist = tnear; - return mask; -#else - return 0; 
-#endif -} - -/* Unaligned nodes intersection */ - -ccl_device_inline int obvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg, - const avxf &isect_near, - const avxf &isect_far, -#ifdef __KERNEL_AVX2__ - const avx3f &org_idir, -#endif - const avx3f &org, - const avx3f &dir, - const avx3f &idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) -{ - const int offset = node_addr; - const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2); - const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4); - const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6); - - const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8); - const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10); - const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12); - - const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14); - const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16); - const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18); - - const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20); - const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22); - const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24); - - const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, - aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, - aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; - - const avxf aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x, - aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y, - aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z; - - const avxf neg_one(-1.0f); - const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; 
- - const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; - - const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; - - const avxf tnear_x = min(tlower_x, tupper_x); - const avxf tnear_y = min(tlower_y, tupper_y); - const avxf tnear_z = min(tlower_z, tupper_z); - const avxf tfar_x = max(tlower_x, tupper_x); - const avxf tfar_y = max(tlower_y, tupper_y); - const avxf tfar_z = max(tlower_z, tupper_z); - const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const avxb vmask = tnear <= tfar; - *dist = tnear; - return movemask(vmask); -} - -/* Intersectors wrappers. - * - * They'll check node type and call appropriate intersection code. - */ - -ccl_device_inline int obvh_node_intersect(KernelGlobals *ccl_restrict kg, - const avxf &isect_near, - const avxf &isect_far, -#ifdef __KERNEL_AVX2__ - const avx3f &org_idir, -#endif - const avx3f &org, - const avx3f &dir, - const avx3f &idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) -{ - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return obvh_unaligned_node_intersect(kg, - isect_near, - isect_far, -#ifdef __KERNEL_AVX2__ - org_idir, -#endif - org, - dir, - idir, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - dist); - } - else { - return obvh_aligned_node_intersect(kg, - isect_near, - isect_far, -#ifdef __KERNEL_AVX2__ - org_idir, -#else - org, -#endif - idir, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - dist); - } -} diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h deleted file mode 100644 index 
b7ab75b723c..00000000000 --- a/intern/cycles/kernel/bvh/obvh_shadow_all.h +++ /dev/null @@ -1,664 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT obvh_node_intersect -#else -# define NODE_INTERSECT obvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const int skip_object, - const uint max_hits, - uint *num_hits) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. 
*/ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - *num_hits = 0; - isect_array->t = tmax; - -#if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; -#endif - - avxf tnear(0.0f), tfar(isect_t); -#if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -#endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - (void)inodes; - - if (false -#ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) -#endif -#if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) -#endif - ) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - //#if !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - avxf cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); - } - else -#endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child - 
*/ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); -#ifdef __VISIBILITY_FLAG__ - if ((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } -#endif - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - if (p_type == PRIMITIVE_TRIANGLE) { - int prim_count = prim_addr2 - prim_addr; - if (prim_count < 3) { - while (prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == - p_type); - int hit = triangle_intersect( - kg, isect_array, P, dir, PATH_RAY_SHADOW, object, prim_addr); - /* Shadow ray early termination. */ - if (hit) { - /* detect if this surface has a shader with transparent shadows */ - - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? 
*/ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; - -#ifdef __HAIR__ - if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) -#endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } -#ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } -#endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if (*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; -#if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -#endif - - isect_array->t = isect_t; - } - - prim_addr++; - } // while - } - else { - kernel_assert((kernel_tex_fetch(__prim_type, (prim_addr)) & PRIMITIVE_ALL) == - p_type); - -#if BVH_FEATURE(BVH_INSTANCING) - int *nhiptr = &num_hits_in_instance; -#else - int nhi = 0; - int *nhiptr = &nhi; -#endif - - int result = triangle_intersect8(kg, - &isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - prim_count, - num_hits, - max_hits, - nhiptr, - isect_t); - if (result == 2) { - return true; - } - } // prim_count - } // PRIMITIVE_TRIANGLE - else { - while (prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - -#ifdef __SHADOW_TRICKS__ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - if (tri_object == skip_object) { - ++prim_addr; - continue; - } -#endif - - bool hit; - - /* todo: specialized intersect functions which don't fill in - * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? 
- * might give a few % performance improvement */ - - switch (p_type) { - -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, prim_addr); - break; - } -#endif -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type); - } - else { - hit = curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type); - } - break; - } -#endif - default: { - hit = false; - break; - } - } - - /* Shadow ray early termination. */ - if (hit) { - /* detect if this surface has a shader with transparent shadows */ - - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? 
*/ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; - -#ifdef __HAIR__ - if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) -#endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } -#ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } -#endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if (*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; -#if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -#endif - - isect_array->t = isect_t; - } - - prim_addr++; - } // while prim - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - -# if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -# else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); -# endif - - num_hits_in_instance = 0; - isect_array->t = isect_t; - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect_t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - } -#endif /* 
FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. */ - if (num_hits_in_instance) { - float t_fac; -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); -# else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); -# endif - /* Scale isect->t to adjust for instancing. */ - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } - else { -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -# endif - } - - isect_t = tmax; - isect_array->t = isect_t; - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect_t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return false; -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h deleted file mode 100644 index 9095233f8b6..00000000000 --- a/intern/cycles/kernel/bvh/obvh_traversal.h +++ /dev/null @@ -1,557 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT obvh_node_intersect -#else -# define NODE_INTERSECT obvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) -{ - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - traversal_stack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - float node_dist = -FLT_MAX; - - /* Ray parameters in registers. 
*/ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - BVH_DEBUG_INIT(); - avxf tnear(0.0f), tfar(ray->t); -#if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -#endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - avx3f P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - (void)inodes; - - if (UNLIKELY(node_dist > isect->t) -#if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) -#endif -#ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0 -#endif - ) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - int child_mask; - avxf dist; - - BVH_DEBUG_NEXT_NODE(); - - { - child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - } - - if (child_mask != 0) { - avxf cnodes; - /* TODO(sergey): Investigate whether moving cnodes upwards - * gives a speedup (will be different cache pattern but will - * avoid extra check here). - */ -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); - } - else -#endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - float d0 = ((float *)&dist)[r]; - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - node_dist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - node_dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - node_dist = d0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - -#ifdef __VISIBILITY_FLAG__ - if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) -#else - if (UNLIKELY((node_dist > isect->t))) -#endif - { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - - /* Primitive intersection. 
*/ - switch (type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - int prim_count = prim_addr2 - prim_addr; - if (prim_count < 3) { - for (; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { - tfar = avxf(isect->t); - /* Shadow ray early termination. */ - if (visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } // for - } - else { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect8(kg, - &isect, - P, - dir, - visibility, - object, - prim_addr, - prim_count, - 0, - 0, - NULL, - 0.0f)) { - tfar = avxf(isect->t); - if (visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } // prim count - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { - tfar = avxf(isect->t); - /* Shadow ray early termination. 
*/ - if (visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for (; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - bool hit; - if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); - } - else { - hit = curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); - } - if (hit) { - tfar = avxf(isect->t); - /* Shadow ray early termination. */ - if (visibility == PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - -# if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); -# else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); -# endif - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - traversal_stack[stack_ptr].dist = -FLT_MAX; - - node_addr = kernel_tex_fetch(__object_node, object); - - BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. 
*/ -# if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h deleted file mode 100644 index fb41ae783ab..00000000000 --- a/intern/cycles/kernel/bvh/obvh_volume.h +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function for volumes, where - * various features can be enabled/disabled. 
This way we can compile optimized - * versions for each case without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT obvh_node_intersect -#else -# define NODE_INTERSECT obvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) -{ - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - avxf tnear(0.0f), tfar(ray->t); -#if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -#endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - -#ifdef __VISIBILITY_FLAG__ - if ((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } -#endif - - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - avxf cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); - } - else -#endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child - 
*/ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - - if ((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch (p_type) { - case PRIMITIVE_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr); - } - break; - } -#endif - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if (object_flag & SD_OBJECT_HAS_VOLUME) { -# if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. 
*/ -# if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h deleted file mode 100644 index 56e2afd4a11..00000000000 --- a/intern/cycles/kernel/bvh/obvh_volume_all.h +++ /dev/null @@ -1,551 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function for volumes, where - * various features can be enabled/disabled. 
This way we can compile optimized - * versions for each case without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT obvh_node_intersect -#else -# define NODE_INTERSECT obvh_aligned_node_intersect -#endif - -ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - const uint visibility) -{ - /* Traversal stack in CUDA thread-local memory. */ - OBVHStackItem traversal_stack[BVH_OSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - uint num_hits = 0; - isect_array->t = tmax; - -#if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; -#endif - - avxf tnear(0.0f), tfar(isect_t); -#if BVH_FEATURE(BVH_HAIR) - avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -#endif - avx3f idir4(avxf(idir.x), avxf(idir.y), avxf(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - avx3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - avx3f org4(avxf(P.x), avxf(P.y), avxf(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. 
*/ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - -#ifdef __VISIBILITY_FLAG__ - if ((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } -#endif - - avxf dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - avxf cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 26); - } - else -#endif - { - cnodes = kernel_tex_fetch_avxf(__bvh_nodes, node_addr + 14); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - - /* Five children are hit, push all onto stack and sort 5 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c4 = __float_as_int(cnodes[r]); - float d4 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Six children are hit, push all onto stack and sort 6 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c5 = __float_as_int(cnodes[r]); - float d5 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c5; - traversal_stack[stack_ptr].dist = d5; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c4; - traversal_stack[stack_ptr].dist = d4; - - /* Seven children are hit, push all onto stack and sort 7 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c6 = __float_as_int(cnodes[r]); - float d6 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Eight children are hit, push all onto stack and sort 8 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c7 = __float_as_int(cnodes[r]); - float d7 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c7; - traversal_stack[stack_ptr].dist = d7; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = c6; - traversal_stack[stack_ptr].dist = d6; - obvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3], - &traversal_stack[stack_ptr - 4], - &traversal_stack[stack_ptr - 5], - &traversal_stack[stack_ptr - 6], - &traversal_stack[stack_ptr - 7]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - - if ((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - bool hit; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch (p_type) { - case PRIMITIVE_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); - if (hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; -#if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -#endif - isect_array->t = isect_t; - if (num_hits == max_hits) { -#if BVH_FEATURE(BVH_INSTANCING) -# if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); -# else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); -# endif - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } -#endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? 
- kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); - if (hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; -# if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -# endif - isect_array->t = isect_t; - if (num_hits == max_hits) { -# if BVH_FEATURE(BVH_INSTANCING) -# if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); -# else - Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); -# endif - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } -# endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } -#endif - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if (object_flag & SD_OBJECT_HAS_VOLUME) { -# if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -# else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); -# endif - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect_t); - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - num_hits_in_instance = 0; - isect_array->t = isect_t; - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_OSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. */ - if (num_hits_in_instance) { - float t_fac; -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); -# else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); -# endif - /* Scale isect->t to adjust for instancing. 
*/ - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } - else { -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -# endif - } - - isect_t = tmax; - isect_array->t = isect_t; - - obvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = avxf(isect_t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = avx3f(avxf(dir.x), avxf(dir.y), avxf(dir.z)); -# endif - idir4 = avx3f(avxf(idir.x), avxf(idir.y), avxf(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = avx3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = avx3f(avxf(P.x), avxf(P.y), avxf(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return num_hits; -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h deleted file mode 100644 index b21f79bd3a0..00000000000 --- a/intern/cycles/kernel/bvh/qbvh_local.h +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* This is a template BVH traversal function for finding local intersections - * around the shading point, for subsurface scattering and bevel. We disable - * various features for performance, and for instanced objects avoid traversing - * other parts of the scene. - * - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT qbvh_node_intersect -#else -# define NODE_INTERSECT qbvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - LocalIntersection *local_isect, - int local_object, - uint *lcg_state, - int max_hits) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - SSE for hair. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, local_object); - - /* Ray parameters in registers. 
*/ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = ray->t; - - if (local_isect != NULL) { - local_isect->num_hits = 0; - } - kernel_assert((local_isect == NULL) == (max_hits == 0)); - - const int object_flag = kernel_tex_fetch(__object_flag, local_object); - if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; - isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -#else - isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); -#endif - object = local_object; - } - - ssef tnear(0.0f), tfar(isect_t); -#if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -#endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. 
*/ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - float4 cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); - } - else -#endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - int prim_addr = __float_as_int(leaf.x); - - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. 
*/ - switch (type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - /* Intersect ray against primitive, */ - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect_local(kg, - local_isect, - P, - dir, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - /* Intersect ray against primitive. */ - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (motion_triangle_intersect_local(kg, - local_isect, - P, - dir, - ray->time, - object, - local_object, - prim_addr, - isect_t, - lcg_state, - max_hits)) { - return true; - } - } - break; - } -#endif - default: - break; - } - } - } while (node_addr != ENTRYPOINT_SENTINEL); - } while (node_addr != ENTRYPOINT_SENTINEL); - - return false; -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h deleted file mode 100644 index 070406fb18a..00000000000 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright 2011-2014, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * Aligned nodes intersection SSE code is adopted from Embree, - */ - -struct QBVHStackItem { - int addr; - float dist; -}; - -ccl_device_inline void qbvh_near_far_idx_calc(const float3 &idir, - int *ccl_restrict near_x, - int *ccl_restrict near_y, - int *ccl_restrict near_z, - int *ccl_restrict far_x, - int *ccl_restrict far_y, - int *ccl_restrict far_z) - -{ -#ifdef __KERNEL_SSE__ - *near_x = 0; - *far_x = 1; - *near_y = 2; - *far_y = 3; - *near_z = 4; - *far_z = 5; - - const size_t mask = movemask(ssef(idir.m128)); - - const int mask_x = mask & 1; - const int mask_y = (mask & 2) >> 1; - const int mask_z = (mask & 4) >> 2; - - *near_x += mask_x; - *far_x -= mask_x; - *near_y += mask_y; - *far_y -= mask_y; - *near_z += mask_z; - *far_z -= mask_z; -#else - if (idir.x >= 0.0f) { - *near_x = 0; - *far_x = 1; - } - else { - *near_x = 1; - *far_x = 0; - } - if (idir.y >= 0.0f) { - *near_y = 2; - *far_y = 3; - } - else { - *near_y = 3; - *far_y = 2; - } - if (idir.z >= 0.0f) { - *near_z = 4; - *far_z = 5; - } - else { - *near_z = 5; - *far_z = 4; - } -#endif -} - -/* TOOD(sergey): Investigate if using intrinsics helps for both - * stack item swap and float comparison. 
- */ -ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, QBVHStackItem *ccl_restrict b) -{ - QBVHStackItem tmp = *a; - *a = *b; - *b = tmp; -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, - QBVHStackItem *ccl_restrict s2, - QBVHStackItem *ccl_restrict s3) -{ - if (s2->dist < s1->dist) { - qbvh_item_swap(s2, s1); - } - if (s3->dist < s2->dist) { - qbvh_item_swap(s3, s2); - } - if (s2->dist < s1->dist) { - qbvh_item_swap(s2, s1); - } -} - -ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, - QBVHStackItem *ccl_restrict s2, - QBVHStackItem *ccl_restrict s3, - QBVHStackItem *ccl_restrict s4) -{ - if (s2->dist < s1->dist) { - qbvh_item_swap(s2, s1); - } - if (s4->dist < s3->dist) { - qbvh_item_swap(s4, s3); - } - if (s3->dist < s1->dist) { - qbvh_item_swap(s3, s1); - } - if (s4->dist < s2->dist) { - qbvh_item_swap(s4, s2); - } - if (s3->dist < s2->dist) { - qbvh_item_swap(s3, s2); - } -} - -/* Axis-aligned nodes intersection */ - -// ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, -static int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, - const ssef &isect_near, - const ssef &isect_far, -#ifdef __KERNEL_AVX2__ - const sse3f &org_idir, -#else - const sse3f &org, -#endif - const sse3f &idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - ssef *ccl_restrict dist) -{ - const int offset = node_addr + 1; -#ifdef __KERNEL_AVX2__ - const ssef tnear_x = msub( - kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x), idir.x, org_idir.x); - const ssef tnear_y = msub( - kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y), idir.y, org_idir.y); - const ssef tnear_z = msub( - kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z), idir.z, org_idir.z); - const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x), idir.x, org_idir.x); - const ssef tfar_y = 
msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y), idir.y, org_idir.y); - const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z), idir.z, org_idir.z); -#else - const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_x) - org.x) * idir.x; - const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_y) - org.y) * idir.y; - const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + near_z) - org.z) * idir.z; - const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_x) - org.x) * idir.x; - const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_y) - org.y) * idir.y; - const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset + far_z) - org.z) * idir.z; -#endif - -#ifdef __KERNEL_SSE41__ - const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); - const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); - const sseb vmask = cast(tnear) > cast(tfar); - int mask = (int)movemask(vmask) ^ 0xf; -#else - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = tnear <= tfar; - int mask = (int)movemask(vmask); -#endif - *dist = tnear; - return mask; -} - -/* Unaligned nodes intersection */ - -ccl_device_inline int qbvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg, - const ssef &isect_near, - const ssef &isect_far, -#ifdef __KERNEL_AVX2__ - const sse3f &org_idir, -#endif - const sse3f &org, - const sse3f &dir, - const sse3f &idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - ssef *ccl_restrict dist) -{ - const int offset = node_addr; - const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 1); - const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 2); - const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 3); - - const ssef 
tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 4); - const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 5); - const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 6); - - const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 7); - const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 8); - const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 9); - - const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset + 10); - const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset + 11); - const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset + 12); - - const ssef aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, - aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, - aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; - - const ssef aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x, - aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y, - aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z; - - const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); - const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; - - const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; - - const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; - -#ifdef __KERNEL_SSE41__ - const ssef tnear_x = mini(tlower_x, tupper_x); - const ssef tnear_y = mini(tlower_y, tupper_y); - const ssef tnear_z = mini(tlower_z, tupper_z); - const ssef tfar_x = maxi(tlower_x, tupper_x); - const ssef tfar_y = maxi(tlower_y, tupper_y); - const ssef tfar_z = maxi(tlower_z, tupper_z); - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = tnear <= tfar; - *dist 
= tnear; - return movemask(vmask); -#else - const ssef tnear_x = min(tlower_x, tupper_x); - const ssef tnear_y = min(tlower_y, tupper_y); - const ssef tnear_z = min(tlower_z, tupper_z); - const ssef tfar_x = max(tlower_x, tupper_x); - const ssef tfar_y = max(tlower_y, tupper_y); - const ssef tfar_z = max(tlower_z, tupper_z); - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const sseb vmask = tnear <= tfar; - *dist = tnear; - return movemask(vmask); -#endif -} - -/* Intersectors wrappers. - * - * They'll check node type and call appropriate intersection code. - */ - -ccl_device_inline int qbvh_node_intersect(KernelGlobals *ccl_restrict kg, - const ssef &isect_near, - const ssef &isect_far, -#ifdef __KERNEL_AVX2__ - const sse3f &org_idir, -#endif - const sse3f &org, - const sse3f &dir, - const sse3f &idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - ssef *ccl_restrict dist) -{ - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return qbvh_unaligned_node_intersect(kg, - isect_near, - isect_far, -#ifdef __KERNEL_AVX2__ - org_idir, -#endif - org, - dir, - idir, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - dist); - } - else { - return qbvh_aligned_node_intersect(kg, - isect_near, - isect_far, -#ifdef __KERNEL_AVX2__ - org_idir, -#else - org, -#endif - idir, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - dist); - } -} diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h deleted file mode 100644 index 682251bf25b..00000000000 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT qbvh_node_intersect -#else -# define NODE_INTERSECT qbvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint visibility, - const uint max_hits, - uint *num_hits) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. 
*/ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - *num_hits = 0; - isect_array->t = tmax; - -#if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; -#endif - - ssef tnear(0.0f), tfar(isect_t); -#if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -#endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - (void)inodes; - - if (false -#ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & visibility) == 0) -#endif -#if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) -#endif - ) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - float4 cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); - } - else -#endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); -#ifdef __VISIBILITY_FLAG__ - if ((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } -#endif - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - while (prim_addr < prim_addr2) { - kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - bool hit; - - /* todo: specialized intersect functions which don't fill in - * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW? - * might give a few % performance improvement */ - - switch (p_type) { - case PRIMITIVE_TRIANGLE: { - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); - break; - } -#endif -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type); - } - else { - hit = curve_intersect(kg, - isect_array, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type); - } - break; - } -#endif - default: { - hit = false; - break; - } - } - - /* Shadow ray early termination. */ - if (hit) { - /* detect if this surface has a shader with transparent shadows */ - - /* todo: optimize so primitive visibility flag indicates if - * the primitive has a transparent shadow shader? 
*/ - int prim = kernel_tex_fetch(__prim_index, isect_array->prim); - int shader = 0; - -#ifdef __HAIR__ - if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE) -#endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } -#ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } -#endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return true; - } - /* if maximum number of hits reached, block all light */ - else if (*num_hits == max_hits) { - return true; - } - - /* move on to next entry in intersections array */ - isect_array++; - (*num_hits)++; -#if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -#endif - - isect_array->t = isect_t; - } - - prim_addr++; - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - -# if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -# else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); -# endif - - num_hits_in_instance = 0; - isect_array->t = isect_t; - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect_t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } 
while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. */ - if (num_hits_in_instance) { - float t_fac; -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); -# else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); -# endif - /* Scale isect->t to adjust for instancing. */ - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } - else { -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -# endif - } - - isect_t = tmax; - isect_array->t = isect_t; - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect_t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return false; -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h deleted file mode 100644 index f43e84bf368..00000000000 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function, where various features can be - * enabled/disabled. This way we can compile optimized versions for each case - * without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_HAIR: hair curve rendering - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT qbvh_node_intersect -#else -# define NODE_INTERSECT qbvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps (for non shadow rays). - * - Separate version for shadow rays. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - traversal_stack[0].dist = -FLT_MAX; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - float node_dist = -FLT_MAX; - - /* Ray parameters in registers. 
*/ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - BVH_DEBUG_INIT(); - - ssef tnear(0.0f), tfar(ray->t); -#if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -#endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - (void)inodes; - - if (UNLIKELY(node_dist > isect->t) -#if BVH_FEATURE(BVH_MOTION) - || UNLIKELY(ray->time < inodes.y) || UNLIKELY(ray->time > inodes.z) -#endif -#ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0 -#endif - ) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - int child_mask; - ssef dist; - - BVH_DEBUG_NEXT_NODE(); - - { - child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - } - - if (child_mask != 0) { - float4 cnodes; - /* TODO(sergey): Investigate whether moving cnodes upwards - * gives a speedup (will be different cache pattern but will - * avoid extra check here). - */ -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); - } - else -#endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - float d0 = ((float *)&dist)[r]; - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - node_dist = d0; - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - node_dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - node_dist = d0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - -#ifdef __VISIBILITY_FLAG__ - if (UNLIKELY((node_dist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0))) -#else - if (UNLIKELY((node_dist > isect->t))) -#endif - { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - - /* Primitive intersection. */ - switch (type & PRIMITIVE_ALL) { - case PRIMITIVE_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if (visibility & PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - if (motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr)) { - tfar = ssef(isect->t); - /* Shadow ray early termination. 
*/ - if (visibility & PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_MOTION) */ -#if BVH_FEATURE(BVH_HAIR) - case PRIMITIVE_CURVE: - case PRIMITIVE_MOTION_CURVE: { - for (; prim_addr < prim_addr2; prim_addr++) { - BVH_DEBUG_NEXT_INTERSECTION(); - const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); - kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); - bool hit; - if (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = cardinal_curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); - } - else { - hit = curve_intersect( - kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type); - } - if (hit) { - tfar = ssef(isect->t); - /* Shadow ray early termination. */ - if (visibility & PATH_RAY_SHADOW_OPAQUE) { - return true; - } - } - } - break; - } -#endif /* BVH_FEATURE(BVH_HAIR) */ - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - -# if BVH_FEATURE(BVH_MOTION) - qbvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist, &ob_itfm); -# else - qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist); -# endif - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - traversal_stack[stack_ptr].dist = -FLT_MAX; - - node_addr = kernel_tex_fetch(__object_node, object); - - 
BVH_DEBUG_NEXT_INSTANCE(); - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. */ -# if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - node_dist = traversal_stack[stack_ptr].dist; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h deleted file mode 100644 index e4eaed04467..00000000000 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function for volumes, where - * various features can be enabled/disabled. This way we can compile optimized - * versions for each case without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT qbvh_node_intersect -#else -# define NODE_INTERSECT qbvh_aligned_node_intersect -#endif - -ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect, - const uint visibility) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. */ - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - isect->t = ray->t; - isect->u = 0.0f; - isect->v = 0.0f; - isect->prim = PRIM_NONE; - isect->object = OBJECT_NONE; - - ssef tnear(0.0f), tfar(ray->t); -#if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -#endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. 
*/ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - -#ifdef __VISIBILITY_FLAG__ - if ((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } -#endif - - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - float4 cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); - } - else -#endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. 
- */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. - */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. 
- */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - - if ((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch (p_type) { - case PRIMITIVE_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. 
*/ - triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - motion_triangle_intersect( - kg, isect, P, dir, ray->time, visibility, object, prim_addr); - } - break; - } -#endif - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. */ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if (object_flag & SD_OBJECT_HAS_VOLUME) { -# if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else - isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. 
*/ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. */ -# if BVH_FEATURE(BVH_MOTION) - isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); -# else - isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); -# endif - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect->t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return (isect->prim != PRIM_NONE); -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h deleted file mode 100644 index eddc48c487e..00000000000 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* This is a template BVH traversal function for volumes, where - * various features can be enabled/disabled. This way we can compile optimized - * versions for each case without new features slowing things down. - * - * BVH_INSTANCING: object instancing - * BVH_MOTION: motion blur rendering - */ - -#if BVH_FEATURE(BVH_HAIR) -# define NODE_INTERSECT qbvh_node_intersect -#else -# define NODE_INTERSECT qbvh_aligned_node_intersect -#endif - -ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, - const Ray *ray, - Intersection *isect_array, - const uint max_hits, - const uint visibility) -{ - /* TODO(sergey): - * - Test if pushing distance on the stack helps. - * - Likely and unlikely for if() statements. - * - Test restrict attribute for pointers. - */ - - /* Traversal stack in CUDA thread-local memory. */ - QBVHStackItem traversal_stack[BVH_QSTACK_SIZE]; - traversal_stack[0].addr = ENTRYPOINT_SENTINEL; - - /* Traversal variables in registers. */ - int stack_ptr = 0; - int node_addr = kernel_data.bvh.root; - - /* Ray parameters in registers. 
*/ - const float tmax = ray->t; - float3 P = ray->P; - float3 dir = bvh_clamp_direction(ray->D); - float3 idir = bvh_inverse_direction(dir); - int object = OBJECT_NONE; - float isect_t = tmax; - -#if BVH_FEATURE(BVH_MOTION) - Transform ob_itfm; -#endif - - uint num_hits = 0; - isect_array->t = tmax; - -#if BVH_FEATURE(BVH_INSTANCING) - int num_hits_in_instance = 0; -#endif - - ssef tnear(0.0f), tfar(isect_t); -#if BVH_FEATURE(BVH_HAIR) - sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -#endif - sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z)); - -#ifdef __KERNEL_AVX2__ - float3 P_idir = P * idir; - sse3f P_idir4(P_idir.x, P_idir.y, P_idir.z); -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - sse3f org4(ssef(P.x), ssef(P.y), ssef(P.z)); -#endif - - /* Offsets to select the side that becomes the lower or upper bound. */ - int near_x, near_y, near_z; - int far_x, far_y, far_z; - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - - /* Traversal loop. */ - do { - do { - /* Traverse internal nodes. */ - while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { - float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); - -#ifdef __VISIBILITY_FLAG__ - if ((__float_as_uint(inodes.x) & visibility) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } -#endif - - ssef dist; - int child_mask = NODE_INTERSECT(kg, - tnear, - tfar, -#ifdef __KERNEL_AVX2__ - P_idir4, -#endif -#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4, -#endif -#if BVH_FEATURE(BVH_HAIR) - dir4, -#endif - idir4, - near_x, - near_y, - near_z, - far_x, - far_y, - far_z, - node_addr, - &dist); - - if (child_mask != 0) { - float4 cnodes; -#if BVH_FEATURE(BVH_HAIR) - if (__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 13); - } - else -#endif - { - cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 7); - } - - /* One child is hit, continue with that child. */ - int r = __bscf(child_mask); - if (child_mask == 0) { - node_addr = __float_as_int(cnodes[r]); - continue; - } - - /* Two children are hit, push far child, and continue with - * closer child. - */ - int c0 = __float_as_int(cnodes[r]); - float d0 = ((float *)&dist)[r]; - r = __bscf(child_mask); - int c1 = __float_as_int(cnodes[r]); - float d1 = ((float *)&dist)[r]; - if (child_mask == 0) { - if (d1 < d0) { - node_addr = c1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - continue; - } - else { - node_addr = c0; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - continue; - } - } - - /* Here starts the slow path for 3 or 4 hit children. We push - * all nodes onto the stack to sort them there. 
- */ - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c1; - traversal_stack[stack_ptr].dist = d1; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c0; - traversal_stack[stack_ptr].dist = d0; - - /* Three children are hit, push all onto stack and sort 3 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c2 = __float_as_int(cnodes[r]); - float d2 = ((float *)&dist)[r]; - if (child_mask == 0) { - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2]); - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - /* Four children are hit, push all onto stack and sort 4 - * stack items, continue with closest child. - */ - r = __bscf(child_mask); - int c3 = __float_as_int(cnodes[r]); - float d3 = ((float *)&dist)[r]; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c3; - traversal_stack[stack_ptr].dist = d3; - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = c2; - traversal_stack[stack_ptr].dist = d2; - qbvh_stack_sort(&traversal_stack[stack_ptr], - &traversal_stack[stack_ptr - 1], - &traversal_stack[stack_ptr - 2], - &traversal_stack[stack_ptr - 3]); - } - - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - - /* If node is leaf, fetch triangle list. */ - if (node_addr < 0) { - float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1)); - - if ((__float_as_uint(leaf.z) & visibility) == 0) { - /* Pop. 
*/ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - continue; - } - - int prim_addr = __float_as_int(leaf.x); - -#if BVH_FEATURE(BVH_INSTANCING) - if (prim_addr >= 0) { -#endif - int prim_addr2 = __float_as_int(leaf.y); - const uint type = __float_as_int(leaf.w); - const uint p_type = type & PRIMITIVE_ALL; - bool hit; - - /* Pop. */ - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - - /* Primitive intersection. */ - switch (p_type) { - case PRIMITIVE_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. */ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); - if (hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; -#if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -#endif - isect_array->t = isect_t; - if (num_hits == max_hits) { -#if BVH_FEATURE(BVH_INSTANCING) - if (object != OBJECT_NONE) { -# if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); -# else - Transform itfm = object_fetch_transform( - kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); -# endif - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } -#endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } -#if BVH_FEATURE(BVH_MOTION) - case PRIMITIVE_MOTION_TRIANGLE: { - for (; prim_addr < prim_addr2; prim_addr++) { - kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - /* Only primitives from volume object. 
*/ - uint tri_object = (object == OBJECT_NONE) ? - kernel_tex_fetch(__prim_object, prim_addr) : - object; - int object_flag = kernel_tex_fetch(__object_flag, tri_object); - if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { - continue; - } - /* Intersect ray against primitive. */ - hit = motion_triangle_intersect( - kg, isect_array, P, dir, ray->time, visibility, object, prim_addr); - if (hit) { - /* Move on to next entry in intersections array. */ - isect_array++; - num_hits++; -# if BVH_FEATURE(BVH_INSTANCING) - num_hits_in_instance++; -# endif - isect_array->t = isect_t; - if (num_hits == max_hits) { -# if BVH_FEATURE(BVH_INSTANCING) - if (object != OBJECT_NONE) { -# if BVH_FEATURE(BVH_MOTION) - float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir)); -# else - Transform itfm = object_fetch_transform( - kg, object, OBJECT_INVERSE_TRANSFORM); - float t_fac = 1.0f / len(transform_direction(&itfm, dir)); -# endif - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } -# endif /* BVH_FEATURE(BVH_INSTANCING) */ - return num_hits; - } - } - } - break; - } -#endif - } - } -#if BVH_FEATURE(BVH_INSTANCING) - else { - /* Instance push. 
*/ - object = kernel_tex_fetch(__prim_object, -prim_addr - 1); - int object_flag = kernel_tex_fetch(__object_flag, object); - if (object_flag & SD_OBJECT_HAS_VOLUME) { -# if BVH_FEATURE(BVH_MOTION) - isect_t = bvh_instance_motion_push( - kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); -# else - isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); -# endif - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect_t); - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - num_hits_in_instance = 0; - isect_array->t = isect_t; - - ++stack_ptr; - kernel_assert(stack_ptr < BVH_QSTACK_SIZE); - traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; - - node_addr = kernel_tex_fetch(__object_node, object); - } - else { - /* Pop. */ - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } - } - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - -#if BVH_FEATURE(BVH_INSTANCING) - if (stack_ptr >= 0) { - kernel_assert(object != OBJECT_NONE); - - /* Instance pop. */ - if (num_hits_in_instance) { - float t_fac; -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm); -# else - bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); -# endif - /* Scale isect->t to adjust for instancing. 
*/ - for (int i = 0; i < num_hits_in_instance; i++) { - (isect_array - i - 1)->t *= t_fac; - } - } - else { -# if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); -# else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); -# endif - } - - isect_t = tmax; - isect_array->t = isect_t; - - qbvh_near_far_idx_calc(idir, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - tfar = ssef(isect_t); -# if BVH_FEATURE(BVH_HAIR) - dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z)); -# endif - idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z)); -# ifdef __KERNEL_AVX2__ - P_idir = P * idir; - P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z); -# endif -# if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__) - org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); -# endif - - object = OBJECT_NONE; - node_addr = traversal_stack[stack_ptr].addr; - --stack_ptr; - } -#endif /* FEATURE(BVH_INSTANCING) */ - } while (node_addr != ENTRYPOINT_SENTINEL); - - return num_hits; -} - -#undef NODE_INTERSECT diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 4cc61e8ee71..6070fd983f5 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -119,13 +119,16 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, differential3 *domega_in, float *pdf) { + /* For curves use the smooth normal, particularly for ribbons the geometric + * normal gives too much darkening otherwise. */ int label; + const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? 
sc->N : sd->Ng; switch (sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: label = bsdf_diffuse_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -140,7 +143,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: label = bsdf_oren_nayar_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -155,7 +158,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, # ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: label = bsdf_phong_ramp_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -169,7 +172,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: label = bsdf_diffuse_ramp_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -184,7 +187,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, # endif case CLOSURE_BSDF_TRANSLUCENT_ID: label = bsdf_translucent_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -198,7 +201,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_REFLECTION_ID: label = bsdf_reflection_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -212,7 +215,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_REFRACTION_ID: label = bsdf_refraction_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -226,7 +229,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_TRANSPARENT_ID: label = bsdf_transparent_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -244,7 +247,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: label = bsdf_microfacet_ggx_sample(kg, sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -260,7 +263,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_sample(kg, sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -277,7 +280,7 @@ 
ccl_device_inline int bsdf_sample(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -294,7 +297,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: label = bsdf_microfacet_beckmann_sample(kg, sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -308,7 +311,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: label = bsdf_ashikhmin_shirley_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -322,7 +325,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: label = bsdf_ashikhmin_velvet_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -336,7 +339,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: label = bsdf_diffuse_toon_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -350,7 +353,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_GLOSSY_TOON_ID: label = bsdf_glossy_toon_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -364,7 +367,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: label = bsdf_hair_reflection_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -378,7 +381,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: label = bsdf_hair_transmission_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -398,7 +401,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: label = bsdf_principled_diffuse_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -412,7 +415,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg, break; case 
CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: label = bsdf_principled_sheen_sample(sc, - sd->Ng, + Ng, sd->I, sd->dI.dx, sd->dI.dy, @@ -485,9 +488,12 @@ ccl_device_inline const float3 omega_in, float *pdf) { + /* For curves use the smooth normal, particularly for ribbons the geometric + * normal gives too much darkening otherwise. */ + const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sd->N : sd->Ng; float3 eval; - if (dot(sd->Ng, omega_in) >= 0.0f) { + if (dot(Ng, omega_in) >= 0.0f) { switch (sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h index f78bbeb5d9d..389bd62ba68 100644 --- a/intern/cycles/kernel/closure/bsdf_hair_principled.h +++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h @@ -206,9 +206,6 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs float3 X = safe_normalize(sd->dPdu); float3 Y = safe_normalize(cross(X, sd->I)); float3 Z = safe_normalize(cross(X, Y)); - /* TODO: the solution below works where sd->Ng is the normal - * pointing from the center of the curve to the shading point. - * It doesn't work for triangles, see https://developer.blender.org/T43625 */ /* h -1..0..1 means the rays goes from grazing the hair, to hitting it at * the center, to grazing the other edge. This is the sine of the angle @@ -216,7 +213,9 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs /* TODO: we convert this value to a cosine later and discard the sign, so * we could probably save some operations. */ - float h = dot(cross(sd->Ng, X), Z); + float h = (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) ? 
+ -sd->v : + dot(cross(sd->Ng, X), Z); kernel_assert(fabsf(h) < 1.0f + 1e-4f); kernel_assert(isfinite3_safe(Y)); diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 928cad58452..6ff0c7f2044 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -23,33 +23,6 @@ CCL_NAMESPACE_BEGIN #ifdef __HAIR__ -/* Interpolation of curve geometry */ - -ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float fc = 0.71f; - float data[4]; - float t2 = t * t; - data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; - data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; - data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; - data[3] = 3.0f * fc * t2 - 2.0f * fc * t; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float data[4]; - float fc = 0.71f; - float t2 = t * t; - float t3 = t2 * t; - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - /* Reading attributes on various curve elements */ ccl_device float curve_attribute_float( @@ -225,6 +198,66 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, } } +ccl_device float4 curve_attribute_float4(KernelGlobals *kg, + const ShaderData *sd, + const AttributeDescriptor desc, + float4 *dx, + float4 *dy) +{ + if (desc.element == ATTR_ELEMENT_CURVE) { + /* idea: we can't derive any useful differentials here, but for tiled + * mipmap image caching it would be useful to avoid reading the highest + * detail level always. maybe a derivative based on the hair density + * could be computed somehow? 
*/ +# ifdef __RAY_DIFFERENTIALS__ + if (dx) + *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + if (dy) + *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +# endif + + return kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim); + } + else if (desc.element == ATTR_ELEMENT_CURVE_KEY || + desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + int k1 = k0 + 1; + + float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0); + float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1); + +# ifdef __RAY_DIFFERENTIALS__ + if (dx) + *dx = sd->du.dx * (f1 - f0); + if (dy) + *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +# endif + + return (1.0f - sd->u) * f0 + sd->u * f1; + } + else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) { +# ifdef __RAY_DIFFERENTIALS__ + if (dx) + *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + if (dy) + *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +# endif + + return kernel_tex_fetch(__attributes_float3, desc.offset); + } + else { +# ifdef __RAY_DIFFERENTIALS__ + if (dx) + *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + if (dy) + *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +# endif + + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } +} + /* Curve thickness */ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) @@ -238,12 +271,12 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) float4 P_curve[2]; - if (sd->type & PRIMITIVE_CURVE) { + if (!(sd->type & PRIMITIVE_ALL_MOTION)) { P_curve[0] = kernel_tex_fetch(__curve_keys, k0); P_curve[1] = kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + motion_curve_keys_linear(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; diff --git 
a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h index 7a770470150..c04dbee52cc 100644 --- a/intern/cycles/kernel/geom/geom_curve_intersect.h +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -1,4 +1,7 @@ /* + * Copyright 2009-2020 Intel Corporation. Adapted from Embree with + * with modifications. + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -14,501 +17,620 @@ CCL_NAMESPACE_BEGIN -/* Curve primitive intersection functions. */ +/* Curve primitive intersection functions. + * + * The code here was adapted from curve_intersector_sweep.h in Embree, to get + * an exact match between Embree CPU ray-tracing and our GPU ray-tracing. */ + +#define CURVE_NUM_BEZIER_SUBDIVISIONS 3 +#define CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE (CURVE_NUM_BEZIER_SUBDIVISIONS + 1) +#define CURVE_NUM_BEZIER_STEPS 2 +#define CURVE_NUM_JACOBIAN_ITERATIONS 5 #ifdef __HAIR__ -# ifdef __KERNEL_SSE2__ -ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) +/* Catmull-rom curve evaluation. */ + +ccl_device_inline float4 catmull_rom_basis_eval(const float4 curve[4], float u) { - return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); + const float t = u; + const float s = 1.0f - u; + const float n0 = -t * s * s; + const float n1 = 2.0f + t * t * (3.0f * t - 5.0f); + const float n2 = 2.0f + s * s * (3.0f * s - 5.0f); + const float n3 = -s * t * t; + return 0.5f * (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3); } -# endif -/* On CPU pass P and dir by reference to aligned vector. 
*/ -ccl_device_forceinline bool cardinal_curve_intersect(KernelGlobals *kg, - Intersection *isect, - const float3 ccl_ref P, - const float3 ccl_ref dir, - uint visibility, - int object, - int curveAddr, - float time, - int type) +ccl_device_inline float4 catmull_rom_basis_derivative(const float4 curve[4], float u) { - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + const float t = u; + const float s = 1.0f - u; + const float n0 = -s * s + 2.0f * s * t; + const float n1 = 2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t; + const float n2 = 2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s; + const float n3 = -2.0f * s * t + t * t; + return 0.5f * (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3); +} -# ifndef __KERNEL_OPTIX__ /* see OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */ - if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if (time < prim_time.x || time > prim_time.y) { - return false; - } - } -# endif +ccl_device_inline float4 catmull_rom_basis_derivative2(const float4 curve[4], float u) +{ - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - float epsilon = 0.0f; - float r_st, r_en; + const float t = u; + const float n0 = -3.0f * t + 2.0f; + const float n1 = 9.0f * t - 5.0f; + const float n2 = -9.0f * t + 4.0f; + const float n3 = 3.0f * t - 1.0f; + return (curve[0] * n0 + curve[1] * n1 + curve[2] * n2 + curve[3] * n3); +} - int depth = kernel_data.curve.subdivisions; - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); +/* Thick Curve */ -# ifdef __KERNEL_SSE2__ - ssef vdir = load4f(dir); - ssef vcurve_coef[4]; - const float3 *curve_coef = (float3 *)vcurve_coef; +ccl_device_inline float3 dnormalize(const float3 p, const float3 dp) +{ + const float pp = dot(p, p); + const float pdp = dot(p, dp); + return (pp * dp - pdp * p) / (pp * sqrtf(pp)); +} - { - ssef dtmp = vdir * vdir; - ssef d_ss = mm_sqrt(dtmp + 
shuffle<2>(dtmp)); - ssef rd_ss = load1f_first(1.0f) / d_ss; - - ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); - int2 &v00 = (int2 &)v00vec; - - int k0 = v00.x + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, v00.x); - int kb = min(k1 + 1, v00.x + v00.y - 1); - -# if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && \ - (!defined(_MSC_VER) || _MSC_VER > 1800) - avxf P_curve_0_1, P_curve_2_3; - if (is_curve_primitive) { - P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); - P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_cardinal_curve_keys_avx( - kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1, &P_curve_2_3); - } -# else /* __KERNEL_AVX2__ */ - ssef P_curve[4]; - - if (is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); - P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); +ccl_device_inline float sqr_point_to_line_distance(const float3 PmQ0, const float3 Q1mQ0) +{ + const float3 N = cross(PmQ0, Q1mQ0); + const float3 D = Q1mQ0; + return dot(N, N) / dot(D, D); +} + +ccl_device_inline bool cylinder_intersect(const float3 cylinder_start, + const float3 cylinder_end, + const float cylinder_radius, + const float3 ray_dir, + float2 *t_o, + float *u0_o, + float3 *Ng0_o, + float *u1_o, + float3 *Ng1_o) +{ + /* Calculate quadratic equation to solve. 
*/ + const float rl = 1.0f / len(cylinder_end - cylinder_start); + const float3 P0 = cylinder_start, dP = (cylinder_end - cylinder_start) * rl; + const float3 O = -P0, dO = ray_dir; + + const float dOdO = dot(dO, dO); + const float OdO = dot(dO, O); + const float OO = dot(O, O); + const float dOz = dot(dP, dO); + const float Oz = dot(dP, O); + + const float A = dOdO - sqr(dOz); + const float B = 2.0f * (OdO - dOz * Oz); + const float C = OO - sqr(Oz) - sqr(cylinder_radius); + + /* We miss the cylinder if determinant is smaller than zero. */ + const float D = B * B - 4.0f * A * C; + if (!(D >= 0.0f)) { + *t_o = make_float2(FLT_MAX, -FLT_MAX); + return false; + } + + /* Special case for rays that are parallel to the cylinder. */ + const float eps = 16.0f * FLT_EPSILON * max(fabsf(dOdO), fabsf(sqr(dOz))); + if (fabsf(A) < eps) { + if (C <= 0.0f) { + *t_o = make_float2(-FLT_MAX, FLT_MAX); + return true; } else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4 *)&P_curve); + *t_o = make_float2(-FLT_MAX, FLT_MAX); + return false; } -# endif /* __KERNEL_AVX2__ */ - - ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); - ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; - ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; - ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - - ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); - ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - -# if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && \ - (!defined(_MSC_VER) || _MSC_VER > 1800) - const avxf vPP = _mm256_broadcast_ps(&P.m128); - const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); - const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); - const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); - - 
const avxf p01 = madd( - shuffle<0>(P_curve_0_1 - vPP), - htfm00, - madd(shuffle<1>(P_curve_0_1 - vPP), htfm11, shuffle<2>(P_curve_0_1 - vPP) * htfm22)); - const avxf p23 = madd( - shuffle<0>(P_curve_2_3 - vPP), - htfm00, - madd(shuffle<1>(P_curve_2_3 - vPP), htfm11, shuffle<2>(P_curve_2_3 - vPP) * htfm22)); - - const ssef p0 = _mm256_castps256_ps128(p01); - const ssef p1 = _mm256_extractf128_ps(p01, 1); - const ssef p2 = _mm256_castps256_ps128(p23); - const ssef p3 = _mm256_extractf128_ps(p23, 1); - - const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); - r_st = ((float4 &)P_curve_1).w; - const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); - r_en = ((float4 &)P_curve_2).w; -# else /* __KERNEL_AVX2__ */ - ssef htfm[] = {htfm0, htfm1, htfm2}; - ssef vP = load4f(P); - ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); - ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); - ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); - ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); - - r_st = ((float4 &)P_curve[1]).w; - r_en = ((float4 &)P_curve[2]).w; -# endif /* __KERNEL_AVX2__ */ - - float fc = 0.71f; - ssef vfc = ssef(fc); - ssef vfcxp3 = vfc * p3; - - vcurve_coef[0] = p1; - vcurve_coef[1] = vfc * (p2 - p0); - vcurve_coef[2] = madd( - ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); } -# else - float3 curve_coef[4]; - /* curve Intersection check */ - /* obtain curve parameters */ + /* Standard case for rays that are not parallel to the cylinder. */ + const float Q = sqrtf(D); + const float rcp_2A = 1.0f / (2.0f * A); + const float t0 = (-B - Q) * rcp_2A; + const float t1 = (-B + Q) * rcp_2A; + + /* Calculates u and Ng for near hit. 
*/ { - /* ray transform created - this should be created at beginning of intersection loop */ - Transform htfm; - float d = sqrtf(dir.x * dir.x + dir.z * dir.z); - htfm = make_transform(dir.z / d, - 0, - -dir.x / d, - 0, - -dir.x * dir.y / d, - d, - -dir.y * dir.z / d, - 0, - dir.x, - dir.y, - dir.z, - 0); - - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + segment; - int k1 = k0 + 1; - - int ka = max(k0 - 1, __float_as_int(v00.x)); - int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if (is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); - } + *u0_o = (t0 * dOz + Oz) * rl; + const float3 Pr = t0 * ray_dir; + const float3 Pl = (*u0_o) * (cylinder_end - cylinder_start) + cylinder_start; + *Ng0_o = Pr - Pl; + } - float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); - float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); - float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); - float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); - - float fc = 0.71f; - curve_coef[0] = p1; - curve_coef[1] = -fc * p0 + fc * p2; - curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; - curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; - r_st = P_curve[1].w; - r_en = P_curve[2].w; + /* Calculates u and Ng for far hit. 
*/ + { + *u1_o = (t1 * dOz + Oz) * rl; + const float3 Pr = t1 * ray_dir; + const float3 Pl = (*u1_o) * (cylinder_end - cylinder_start) + cylinder_start; + *Ng1_o = Pr - Pl; } -# endif - float r_curr = max(r_st, r_en); - - if ((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) - epsilon = 2 * r_curr; - - /* find bounds - this is slow for cubic curves */ - float upper, lower; - - float zextrem[4]; - curvebounds(&lower, - &upper, - &zextrem[0], - &zextrem[1], - &zextrem[2], - &zextrem[3], - curve_coef[0].z, - curve_coef[1].z, - curve_coef[2].z, - curve_coef[3].z); - if (lower - r_curr > isect->t || upper + r_curr < epsilon) - return false; + *t_o = make_float2(t0, t1); - /* minimum width extension */ - float xextrem[4]; - curvebounds(&lower, - &upper, - &xextrem[0], - &xextrem[1], - &xextrem[2], - &xextrem[3], - curve_coef[0].x, - curve_coef[1].x, - curve_coef[2].x, - curve_coef[3].x); - if (lower > r_curr || upper < -r_curr) - return false; + return true; +} - float yextrem[4]; - curvebounds(&lower, - &upper, - &yextrem[0], - &yextrem[1], - &yextrem[2], - &yextrem[3], - curve_coef[0].y, - curve_coef[1].y, - curve_coef[2].y, - curve_coef[3].y); - if (lower > r_curr || upper < -r_curr) - return false; +ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_dir) +{ + const float3 O = -P; + const float3 D = ray_dir; + const float ON = dot(O, N); + const float DN = dot(D, N); + const float min_rcp_input = 1e-18f; + const bool eps = fabsf(DN) < min_rcp_input; + const float t = -ON / DN; + const float lower = (eps || DN < 0.0f) ? -FLT_MAX : t; + const float upper = (eps || DN > 0.0f) ? 
FLT_MAX : t; + return make_float2(lower, upper); +} - /* setup recurrent loop */ - int level = 1 << depth; - int tree = 0; - float resol = 1.0f / (float)level; - bool hit = false; - - /* begin loop */ - while (!(tree >> (depth))) { - const float i_st = tree * resol; - const float i_en = i_st + (level * resol); - -# ifdef __KERNEL_SSE2__ - ssef vi_st = ssef(i_st), vi_en = ssef(i_en); - ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), - vi_st, - vcurve_coef[0]); - ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), - vi_en, - vcurve_coef[0]); - - ssef vbmin = min(vp_st, vp_en); - ssef vbmax = max(vp_st, vp_en); - - float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; - float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; - float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; - float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; -# else - float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + - curve_coef[0]; - float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + - curve_coef[0]; - - float bminx = min(p_st.x, p_en.x); - float bmaxx = max(p_st.x, p_en.x); - float bminy = min(p_st.y, p_en.y); - float bmaxy = max(p_st.y, p_en.y); - float bminz = min(p_st.z, p_en.z); - float bmaxz = max(p_st.z, p_en.z); -# endif +ccl_device bool curve_intersect_iterative(const float3 ray_dir, + const float dt, + const float4 curve[4], + float u, + float t, + const bool use_backfacing, + Intersection *isect) +{ + const float length_ray_dir = len(ray_dir); + + /* Error of curve evaluations is proportional to largest coordinate. 
*/ + const float4 box_min = min(min(curve[0], curve[1]), min(curve[2], curve[3])); + const float4 box_max = max(min(curve[0], curve[1]), max(curve[2], curve[3])); + const float4 box_abs = max(fabs(box_min), fabs(box_max)); + const float P_err = 16.0f * FLT_EPSILON * + max(box_abs.x, max(box_abs.y, max(box_abs.z, box_abs.w))); + const float radius_max = box_max.w; + + for (int i = 0; i < CURVE_NUM_JACOBIAN_ITERATIONS; i++) { + const float3 Q = ray_dir * t; + const float3 dQdt = ray_dir; + const float Q_err = 16.0f * FLT_EPSILON * length_ray_dir * t; + + const float4 P4 = catmull_rom_basis_eval(curve, u); + const float4 dPdu4 = catmull_rom_basis_derivative(curve, u); + + const float3 P = float4_to_float3(P4); + const float3 dPdu = float4_to_float3(dPdu4); + const float radius = P4.w; + const float dradiusdu = dPdu4.w; + + const float3 ddPdu = float4_to_float3(catmull_rom_basis_derivative2(curve, u)); + + const float3 R = Q - P; + const float len_R = len(R); + const float R_err = max(Q_err, P_err); + const float3 dRdu = -dPdu; + const float3 dRdt = dQdt; + + const float3 T = normalize(dPdu); + const float3 dTdu = dnormalize(dPdu, ddPdu); + const float cos_err = P_err / len(dPdu); + + const float f = dot(R, T); + const float f_err = len_R * P_err + R_err + cos_err * (1.0f + len_R); + const float dfdu = dot(dRdu, T) + dot(R, dTdu); + const float dfdt = dot(dRdt, T); + + const float K = dot(R, R) - sqr(f); + const float dKdu = (dot(R, dRdu) - f * dfdu); + const float dKdt = (dot(R, dRdt) - f * dfdt); + const float rsqrt_K = inversesqrtf(K); + + const float g = sqrtf(K) - radius; + const float g_err = R_err + f_err + 16.0f * FLT_EPSILON * radius_max; + const float dgdu = dKdu * rsqrt_K - dradiusdu; + const float dgdt = dKdt * rsqrt_K; + + const float invdet = 1.0f / (dfdu * dgdt - dgdu * dfdt); + u -= (dgdt * f - dfdt * g) * invdet; + t -= (-dgdu * f + dfdu * g) * invdet; + + if (fabsf(f) < f_err && fabsf(g) < g_err) { + t += dt; + if (!(0.0f <= t && t <= isect->t)) { + 
return false; /* Rejects NaNs */ + } + if (!(u >= 0.0f && u <= 1.0f)) { + return false; /* Rejects NaNs */ + } - if (xextrem[0] >= i_st && xextrem[0] <= i_en) { - bminx = min(bminx, xextrem[1]); - bmaxx = max(bmaxx, xextrem[1]); - } - if (xextrem[2] >= i_st && xextrem[2] <= i_en) { - bminx = min(bminx, xextrem[3]); - bmaxx = max(bmaxx, xextrem[3]); - } - if (yextrem[0] >= i_st && yextrem[0] <= i_en) { - bminy = min(bminy, yextrem[1]); - bmaxy = max(bmaxy, yextrem[1]); - } - if (yextrem[2] >= i_st && yextrem[2] <= i_en) { - bminy = min(bminy, yextrem[3]); - bmaxy = max(bmaxy, yextrem[3]); - } - if (zextrem[0] >= i_st && zextrem[0] <= i_en) { - bminz = min(bminz, zextrem[1]); - bmaxz = max(bmaxz, zextrem[1]); - } - if (zextrem[2] >= i_st && zextrem[2] <= i_en) { - bminz = min(bminz, zextrem[3]); - bmaxz = max(bmaxz, zextrem[3]); - } + /* Backface culling. */ + const float3 R = normalize(Q - P); + const float3 U = dradiusdu * R + dPdu; + const float3 V = cross(dPdu, R); + const float3 Ng = cross(V, U); + if (!use_backfacing && dot(ray_dir, Ng) > 0.0f) { + return false; + } - float r1 = r_st + (r_en - r_st) * i_st; - float r2 = r_st + (r_en - r_st) * i_en; - r_curr = max(r1, r2); + /* Record intersection. */ + isect->t = t; + isect->u = u; + isect->v = 0.0f; - if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_curr || - bmaxx < -r_curr || bminy > r_curr || bmaxy < -r_curr) { - /* the bounding box does not overlap the square centered at O */ - tree += level; - level = tree & -tree; + return true; } - else if (level == 1) { - - /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. 
- * dP* is reversed if necessary.*/ - float t = isect->t; - float u = 0.0f; - float gd = 0.0f; - - if (flags & CURVE_KN_RIBBONS) { - float3 tg = (p_en - p_st); -# ifdef __KERNEL_SSE__ - const float3 tg_sq = tg * tg; - float w = tg_sq.x + tg_sq.y; -# else - float w = tg.x * tg.x + tg.y * tg.y; -# endif - if (w == 0) { - tree++; - level = tree & -tree; - continue; - } -# ifdef __KERNEL_SSE__ - const float3 p_sttg = p_st * tg; - w = -(p_sttg.x + p_sttg.y) / w; + } + return false; +} + +ccl_device bool curve_intersect_recursive(const float3 ray_orig, + const float3 ray_dir, + float4 curve[4], + Intersection *isect) +{ + /* Move ray closer to make intersection stable. */ + const float3 center = float4_to_float3(0.25f * (curve[0] + curve[1] + curve[2] + curve[3])); + const float dt = dot(center - ray_orig, ray_dir) / dot(ray_dir, ray_dir); + const float3 ref = ray_orig + ray_dir * dt; + const float4 ref4 = make_float4(ref.x, ref.y, ref.z, 0.0f); + curve[0] -= ref4; + curve[1] -= ref4; + curve[2] -= ref4; + curve[3] -= ref4; + + const bool use_backfacing = false; + const float step_size = 1.0f / (float)(CURVE_NUM_BEZIER_STEPS); + + int depth = 0; + + /* todo: optimize stack for GPU somehow? Possibly some bitflags are enough, and + * u0/u1 can be derived from the depth. */ + struct { + float u0, u1; + int i; + } stack[CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE]; + + bool found = false; + + float u0 = 0.0f; + float u1 = 1.0f; + int i = 0; + + while (1) { + for (; i < CURVE_NUM_BEZIER_STEPS; i++) { + const float step = i * step_size; + + /* Subdivide curve. 
*/ + const float dscale = (u1 - u0) * (1.0f / 3.0f) * step_size; + const float vu0 = mix(u0, u1, step); + const float vu1 = mix(u0, u1, step + step_size); + + const float4 P0 = catmull_rom_basis_eval(curve, vu0); + const float4 dP0du = dscale * catmull_rom_basis_derivative(curve, vu0); + const float4 P3 = catmull_rom_basis_eval(curve, vu1); + const float4 dP3du = dscale * catmull_rom_basis_derivative(curve, vu1); + + const float4 P1 = P0 + dP0du; + const float4 P2 = P3 - dP3du; + + /* Calculate bounding cylinders. */ + const float rr1 = sqr_point_to_line_distance(float4_to_float3(dP0du), + float4_to_float3(P3 - P0)); + const float rr2 = sqr_point_to_line_distance(float4_to_float3(dP3du), + float4_to_float3(P3 - P0)); + const float maxr12 = sqrtf(max(rr1, rr2)); + const float one_plus_ulp = 1.0f + 2.0f * FLT_EPSILON; + const float one_minus_ulp = 1.0f - 2.0f * FLT_EPSILON; + float r_outer = max(max(P0.w, P1.w), max(P2.w, P3.w)) + maxr12; + float r_inner = min(min(P0.w, P1.w), min(P2.w, P3.w)) - maxr12; + r_outer = one_plus_ulp * r_outer; + r_inner = max(0.0f, one_minus_ulp * r_inner); + bool valid = true; + + /* Intersect with outer cylinder. */ + float2 tc_outer; + float u_outer0, u_outer1; + float3 Ng_outer0, Ng_outer1; + valid = cylinder_intersect(float4_to_float3(P0), + float4_to_float3(P3), + r_outer, + ray_dir, + &tc_outer, + &u_outer0, + &Ng_outer0, + &u_outer1, + &Ng_outer1); + if (!valid) { + continue; + } + + /* Intersect with cap-planes. */ + float2 tp = make_float2(-dt, isect->t - dt); + tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y)); + const float2 h0 = half_plane_intersect( + float4_to_float3(P0), float4_to_float3(dP0du), ray_dir); + tp = make_float2(max(tp.x, h0.x), min(tp.y, h0.y)); + const float2 h1 = half_plane_intersect( + float4_to_float3(P3), -float4_to_float3(dP3du), ray_dir); + tp = make_float2(max(tp.x, h1.x), min(tp.y, h1.y)); + valid = tp.x <= tp.y; + if (!valid) { + continue; + } + + /* Clamp and correct u parameter. 
*/ + u_outer0 = clamp(u_outer0, 0.0f, 1.0f); + u_outer1 = clamp(u_outer1, 0.0f, 1.0f); + u_outer0 = mix(u0, u1, (step + u_outer0) * (1.0f / (float)(CURVE_NUM_BEZIER_STEPS + 1))); + u_outer1 = mix(u0, u1, (step + u_outer1) * (1.0f / (float)(CURVE_NUM_BEZIER_STEPS + 1))); + + /* Intersect with inner cylinder. */ + float2 tc_inner; + float u_inner0, u_inner1; + float3 Ng_inner0, Ng_inner1; + const bool valid_inner = cylinder_intersect(float4_to_float3(P0), + float4_to_float3(P3), + r_inner, + ray_dir, + &tc_inner, + &u_inner0, + &Ng_inner0, + &u_inner1, + &Ng_inner1); + + /* At the unstable area we subdivide deeper. */ +# if 0 + const bool unstable0 = (!valid_inner) | + (fabsf(dot(normalize(ray_dir), normalize(Ng_inner0))) < 0.3f); + const bool unstable1 = (!valid_inner) | + (fabsf(dot(normalize(ray_dir), normalize(Ng_inner1))) < 0.3f); # else - w = -(p_st.x * tg.x + p_st.y * tg.y) / w; -# endif - w = saturate(w); - - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r_st + (r_en - r_st) * u; - /* compare x-y distances */ - float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + - curve_coef[0]; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if (dot(tg, dp_st) < 0) - dp_st *= -1; - if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { - tree++; - level = tree & -tree; - continue; - } - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if (dot(tg, dp_en) < 0) - dp_en *= -1; - if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { - tree++; - level = tree & -tree; - continue; - } + /* On the GPU appears to be a little faster if always enabled. 
*/ + (void)valid_inner; - if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_curr * r_curr || p_curr.z <= epsilon || - isect->t < p_curr.z) { - tree++; - level = tree & -tree; - continue; - } + const bool unstable0 = true; + const bool unstable1 = true; +# endif - t = p_curr.z; + /* Subtract the inner interval from the current hit interval. */ + float2 tp0 = make_float2(tp.x, min(tp.y, tc_inner.x)); + float2 tp1 = make_float2(max(tp.x, tc_inner.y), tp.y); + bool valid0 = valid && (tp0.x <= tp0.y); + bool valid1 = valid && (tp1.x <= tp1.y); + if (!(valid0 || valid1)) { + continue; } - else { - float l = len(p_en - p_st); - float invl = 1.0f / l; - float3 tg = (p_en - p_st) * invl; - gd = (r2 - r1) * invl; - float difz = -dot(p_st, tg); - float cyla = 1.0f - (tg.z * tg.z * (1 + gd * gd)); - float invcyla = 1.0f / cyla; - float halfb = (-p_st.z - tg.z * (difz + gd * (difz * gd + r1))); - float tcentre = -halfb * invcyla; - float zcentre = difz + (tg.z * tcentre); - float3 tdif = -p_st; - tdif.z += tcentre; - float tdifz = dot(tdif, tg); - float tb = 2 * (tdif.z - tg.z * (tdifz + gd * (tdifz * gd + r1))); - float tc = dot(tdif, tdif) - tdifz * tdifz * (1 + gd * gd) - r1 * r1 - 2 * r1 * tdifz * gd; - float td = tb * tb - 4 * cyla * tc; - if (td < 0.0f) { - tree++; - level = tree & -tree; - continue; - } - float rootd = sqrtf(td); - float correction = (-tb - rootd) * 0.5f * invcyla; - t = tcentre + correction; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if (dot(tg, dp_st) < 0) - dp_st *= -1; - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if (dot(tg, dp_en) < 0) - dp_en *= -1; - - if (flags & CURVE_KN_BACKFACING && - (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || - isect->t < t || t <= 0.0f)) { - correction = (-tb + rootd) * 0.5f * invcyla; - t = tcentre + correction; + /* Process one or two hits. 
*/ + bool recurse = false; + if (valid0) { + const int termDepth = unstable0 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE : + CURVE_NUM_BEZIER_SUBDIVISIONS; + if (depth >= termDepth) { + found |= curve_intersect_iterative( + ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect); } - - if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || - isect->t < t || t <= 0.0f) { - tree++; - level = tree & -tree; - continue; + else { + recurse = true; } + } - float w = (zcentre + (tg.z * correction)) * invl; - w = saturate(w); - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; + if (valid1 && (tp1.x + dt <= isect->t)) { + const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE : + CURVE_NUM_BEZIER_SUBDIVISIONS; + if (depth >= termDepth) { + found |= curve_intersect_iterative( + ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect); + } + else { + recurse = true; + } } - /* we found a new intersection */ -# ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if (kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -# endif - { - /* record intersection */ - isect->t = t; - isect->u = u; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - hit = true; + if (recurse) { + stack[depth].u0 = u0; + stack[depth].u1 = u1; + stack[depth].i = i + 1; + depth++; + + u0 = vu0; + u1 = vu1; + i = -1; } + } - tree++; - level = tree & -tree; + if (depth > 0) { + depth--; + u0 = stack[depth].u0; + u1 = stack[depth].u1; + i = stack[depth].i; } else { - /* split the curve into two curves and process */ - level = level >> 1; + break; } } - return hit; + return found; +} + +/* Ribbons */ + +ccl_device_inline bool cylinder_culling_test(const float2 p1, const float2 p2, const float r) +{ + /* Performs culling against a cylinder. 
*/ + const float2 dp = p2 - p1; + const float num = dp.x * p1.y - dp.y * p1.x; + const float den2 = dot(p2 - p1, p2 - p1); + return num * num <= r * r * den2; +} + +/*! Intersects a ray with a quad with backface culling + * enabled. The quad v0,v1,v2,v3 is split into two triangles + * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two + * triangles gets intersected. */ +ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar, + const float3 quad_v0, + const float3 quad_v1, + const float3 quad_v2, + const float3 quad_v3, + float *u_o, + float *v_o, + float *t_o) +{ + /* Calculate vertices relative to ray origin? */ + const float3 O = make_float3(0.0f, 0.0f, 0.0f); + const float3 D = make_float3(0.0f, 0.0f, 1.0f); + const float3 va = quad_v0 - O; + const float3 vb = quad_v1 - O; + const float3 vc = quad_v2 - O; + const float3 vd = quad_v3 - O; + + const float3 edb = vb - vd; + const float WW = dot(cross(vd, edb), D); + const float3 v0 = (WW <= 0.0f) ? va : vc; + const float3 v1 = (WW <= 0.0f) ? vb : vd; + const float3 v2 = (WW <= 0.0f) ? vd : vb; + + /* Calculate edges? */ + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + + /* perform edge tests */ + const float U = dot(cross(v0, e0), D); + const float V = dot(cross(v1, e1), D); + if (!(max(U, V) <= 0.0f)) { + return false; + } + + /* Calculate geometry normal and denominator? */ + const float3 Ng = cross(e1, e0); + const float den = dot(Ng, D); + const float rcpDen = 1.0f / den; + + /* Perform depth test? */ + const float t = rcpDen * dot(v0, Ng); + if (!(0.0f <= t && t <= ray_tfar)) { + return false; + } + + /* Avoid division by 0? */ + if (!(den != 0.0f)) { + return false; + } + + /* Update hit information? */ + *t_o = t; + *u_o = U * rcpDen; + *v_o = V * rcpDen; + *u_o = (WW <= 0.0f) ? *u_o : 1.0f - *u_o; + *v_o = (WW <= 0.0f) ? 
*v_o : 1.0f - *v_o; + return true; +} + +ccl_device_inline void ribbon_ray_space(const float3 ray_dir, float3 ray_space[3]) +{ + const float3 dx0 = make_float3(0, ray_dir.z, -ray_dir.y); + const float3 dx1 = make_float3(-ray_dir.z, 0, ray_dir.x); + ray_space[0] = normalize(dot(dx0, dx0) > dot(dx1, dx1) ? dx0 : dx1); + ray_space[1] = normalize(cross(ray_dir, ray_space[0])); + ray_space[2] = ray_dir; +} + +ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3], + const float3 ray_org, + const float4 P4) +{ + float3 P = float4_to_float3(P4) - ray_org; + return make_float4(dot(ray_space[0], P), dot(ray_space[1], P), dot(ray_space[2], P), P4.w); +} + +ccl_device_inline bool ribbon_intersect(const float3 ray_org, + const float3 ray_dir, + const float ray_tfar, + const int N, + float4 curve[4], + Intersection *isect) +{ + /* Transform control points into ray space. */ + float3 ray_space[3]; + ribbon_ray_space(ray_dir, ray_space); + + curve[0] = ribbon_to_ray_space(ray_space, ray_org, curve[0]); + curve[1] = ribbon_to_ray_space(ray_space, ray_org, curve[1]); + curve[2] = ribbon_to_ray_space(ray_space, ray_org, curve[2]); + curve[3] = ribbon_to_ray_space(ray_space, ray_org, curve[3]); + + const float4 mx = max(max(fabs(curve[0]), fabs(curve[1])), max(fabs(curve[2]), fabs(curve[3]))); + const float eps = 4.0f * FLT_EPSILON * max(max(mx.x, mx.y), max(mx.z, mx.w)); + const float step_size = 1.0f / (float)N; + + /* Evaluate first point and radius scaled normal direction. */ + float4 p0 = catmull_rom_basis_eval(curve, 0.0f); + float3 dp0dt = float4_to_float3(catmull_rom_basis_derivative(curve, 0.0f)); + if (max3(fabs(dp0dt)) < eps) { + const float4 p1 = catmull_rom_basis_eval(curve, step_size); + dp0dt = float4_to_float3(p1 - p0); + } + float3 wn0 = normalize(make_float3(dp0dt.y, -dp0dt.x, 0.0f)) * p0.w; + + /* Evaluate the bezier curve. 
*/ + for (int i = 0; i < N; i++) { + const float u = i * step_size; + const float4 p1 = catmull_rom_basis_eval(curve, u + step_size); + bool valid = cylinder_culling_test( + make_float2(p0.x, p0.y), make_float2(p1.x, p1.y), max(p0.w, p1.w)); + if (!valid) { + continue; + } + + /* Evaluate next point. */ + float3 dp1dt = float4_to_float3(catmull_rom_basis_derivative(curve, u + step_size)); + dp1dt = (max3(fabs(dp1dt)) < eps) ? float4_to_float3(p1 - p0) : dp1dt; + const float3 wn1 = normalize(make_float3(dp1dt.y, -dp1dt.x, 0.0f)) * p1.w; + + /* Construct quad coordinates. */ + const float3 lp0 = float4_to_float3(p0) + wn0; + const float3 lp1 = float4_to_float3(p1) + wn1; + const float3 up0 = float4_to_float3(p0) - wn0; + const float3 up1 = float4_to_float3(p1) - wn1; + + /* Intersect quad. */ + float vu, vv, vt; + bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt); + + if (valid0) { + /* ignore self intersections */ + const float avoidance_factor = 2.0f; + if (avoidance_factor != 0.0f) { + float r = mix(p0.w, p1.w, vu); + valid0 = vt > avoidance_factor * r; + } + + if (valid0) { + vv = 2.0f * vv - 1.0f; + + /* Record intersection. 
*/ + isect->t = vt; + isect->u = u + vu * step_size; + isect->v = vv; + return true; + } + } + + p0 = p1; + wn0 = wn1; + } + return false; } ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, - float3 direction, + const float3 P, + const float3 dir, uint visibility, int object, int curveAddr, float time, int type) { - /* define few macros to minimize code duplication for SSE */ -# ifndef __KERNEL_SSE2__ -# define len3_squared(x) len_squared(x) -# define len3(x) len(x) -# define dot3(x, y) dot(x, y) -# endif - - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + const bool is_motion = (type & PRIMITIVE_ALL_MOTION); -# ifndef __KERNEL_OPTIX__ /* see OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */ - if (!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { +# ifndef __KERNEL_OPTIX__ /* See OptiX motion flag OPTIX_MOTION_FLAG_[START|END]_VANISH */ + if (is_motion && kernel_data.bvh.use_bvh_steps) { const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); if (time < prim_time.x || time > prim_time.y) { return false; @@ -517,210 +639,63 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, # endif int segment = PRIMITIVE_UNPACK_SEGMENT(type); - /* curve Intersection check */ - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); + float4 v00 = kernel_tex_fetch(__curves, prim); - int cnum = __float_as_int(v00.x); - int k0 = cnum + segment; + int k0 = __float_as_int(v00.x) + segment; int k1 = k0 + 1; -# ifndef __KERNEL_SSE2__ - float4 P_curve[2]; + int ka = max(k0 - 1, __float_as_int(v00.x)); + int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1); - if (is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, k0); - P_curve[1] = kernel_tex_fetch(__curve_keys, k1); + float4 curve[4]; + if (!is_motion) { + curve[0] = kernel_tex_fetch(__curve_keys, ka); + curve[1] = kernel_tex_fetch(__curve_keys, k0); + curve[2] = 
kernel_tex_fetch(__curve_keys, k1); + curve[3] = kernel_tex_fetch(__curve_keys, kb); } else { int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); - } - - float r1 = P_curve[0].w; - float r2 = P_curve[1].w; - float3 p1 = float4_to_float3(P_curve[0]); - float3 p2 = float4_to_float3(P_curve[1]); - - /* minimum width extension */ - float3 dif = P - p1; - float3 dif_second = P - p2; - - float3 p21_diff = p2 - p1; - float3 sphere_dif1 = (dif + dif_second) * 0.5f; - float3 dir = direction; - float sphere_b_tmp = dot3(dir, sphere_dif1); - float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; -# else - ssef P_curve[2]; - - if (is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); + motion_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, curve); } - else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4 *)&P_curve); - } - - ssef r12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - const ssef vP = load4f(P); - const ssef dif = vP - P_curve[0]; - const ssef dif_second = vP - P_curve[1]; - float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); - - const ssef p21_diff = P_curve[1] - P_curve[0]; - const ssef sphere_dif1 = (dif + dif_second) * 0.5f; - const ssef dir = load4f(direction); - const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); -# endif - - float mr = max(r1, r2); - float l = len3(p21_diff); - float invl = 1.0f / l; - float sp_r = mr + 0.5f * l; - float sphere_b = dot3(dir, sphere_dif2); - float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; - - if (sdisc < 0.0f) - return false; - - /* obtain parameters and test midpoint distance for suitable modes */ -# ifndef __KERNEL_SSE2__ - float3 
tg = p21_diff * invl; -# else - const ssef tg = p21_diff * invl; -# endif - float gd = (r2 - r1) * invl; - - float dirz = dot3(dir, tg); - float difz = dot3(dif, tg); - - float a = 1.0f - (dirz * dirz * (1 + gd * gd)); - - float halfb = dot3(dir, dif) - dirz * (difz + gd * (difz * gd + r1)); - - float tcentre = -halfb / a; - float zcentre = difz + (dirz * tcentre); - - if ((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return false; - if ((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && - !(flags & CURVE_KN_INTERSECTCORRECTION)) +# ifdef __VISIBILITY_FLAG__ + if (!(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)) { return false; - - /* test minimum separation */ -# ifndef __KERNEL_SSE2__ - float3 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross(tg, dif)); -# else - const ssef cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross_zxy(tg, dif)); + } # endif - float cprodsq = len3_squared(cprod); - float distscaled = dot3(cprod, dif); - - if (cprodsq == 0) - distscaled = cprod2sq; - else - distscaled = (distscaled * distscaled) / cprodsq; - - if (distscaled > mr * mr) - return false; - /* calculate true intersection */ -# ifndef __KERNEL_SSE2__ - float3 tdif = dif + tcentre * dir; -# else - const ssef tdif = madd(ssef(tcentre), dir, dif); -# endif - float tdifz = dot3(tdif, tg); - float tdifma = tdifz * gd + r1; - float tb = 2 * (dot3(dir, tdif) - dirz * (tdifz + gd * tdifma)); - float tc = dot3(tdif, tdif) - tdifz * tdifz - tdifma * tdifma; - float td = tb * tb - 4 * a * tc; + if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { + /* todo: adaptive number of subdivisions could help performance here. 
*/ + const int subdivisions = kernel_data.bvh.curve_subdivisions; + if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) { + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + return true; + } - if (td < 0.0f) return false; - - float rootd = 0.0f; - float correction = 0.0f; - if (flags & CURVE_KN_ACCURATE) { - rootd = sqrtf(td); - correction = ((-tb - rootd) / (2 * a)); } - - float t = tcentre + correction; - - if (t < isect->t) { - - if (flags & CURVE_KN_INTERSECTCORRECTION) { - rootd = sqrtf(td); - correction = ((-tb - rootd) / (2 * a)); - t = tcentre + correction; - } - - float z = zcentre + (dirz * correction); - // bool backface = false; - - if (flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { - // backface = true; - correction = ((-tb + rootd) / (2 * a)); - t = tcentre + correction; - z = zcentre + (dirz * correction); + else { + if (curve_intersect_recursive(P, dir, curve, isect)) { + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + return true; } - if (t > 0.0f && t < isect->t && z >= 0 && z <= l) { - - if (flags & CURVE_KN_ENCLOSEFILTER) { - float enc_ratio = 1.01f; - if ((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { - float a2 = 1.0f - (dirz * dirz * (1 + gd * gd * enc_ratio * enc_ratio)); - float c2 = dot3(dif, dif) - difz * difz * (1 + gd * gd * enc_ratio * enc_ratio) - - r1 * r1 * enc_ratio * enc_ratio - 2 * r1 * difz * gd * enc_ratio; - if (a2 * c2 < 0.0f) - return false; - } - } - -# ifdef __VISIBILITY_FLAG__ - /* visibility flag test. 
we do it here under the assumption - * that most triangles are culled by node flags */ - if (kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -# endif - { - /* record intersection */ - isect->t = t; - isect->u = z * invl; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - - return true; - } - } + return false; } - - return false; - -# ifndef __KERNEL_SSE2__ -# undef len3_squared -# undef len3 -# undef dot3 -# endif } -ccl_device_inline float3 curve_refine(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray) +ccl_device_inline void curve_shader_setup(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) { - int flag = kernel_data.curve.curveflags; float t = isect->t; float3 P = ray->P; float3 D = ray->D; @@ -743,118 +718,60 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg, int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; - float3 tg; + int ka = max(k0 - 1, __float_as_int(v00.x)); + int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1); - if (flag & CURVE_KN_INTERPOLATE) { - int ka = max(k0 - 1, __float_as_int(v00.x)); - int kb = min(k1 + 1, __float_as_int(v00.x) + __float_as_int(v00.y) - 1); + float4 P_curve[4]; - float4 P_curve[4]; + if (!(sd->type & PRIMITIVE_ALL_MOTION)) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); + } - if (sd->type & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, 
k0, k1, kb, P_curve); - } + sd->u = isect->u; + sd->v = isect->v; - float3 p[4]; - p[0] = float4_to_float3(P_curve[0]); - p[1] = float4_to_float3(P_curve[1]); - p[2] = float4_to_float3(P_curve[2]); - p[3] = float4_to_float3(P_curve[3]); + P = P + D * t; - P = P + D * t; + const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u); + const float3 dPdu = float4_to_float3(dPdu4); -# ifdef __UV__ - sd->u = isect->u; - sd->v = 0.0f; -# endif + if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { + /* Rounded smooth normals for ribbons, to approximate thick curve shape. */ + const float3 tangent = normalize(dPdu); + const float3 bitangent = normalize(cross(tangent, -D)); + const float sine = isect->v; + const float cosine = safe_sqrtf(1.0f - sine * sine); - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent))); + sd->Ng = -D; - if (kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - sd->Ng = normalize(-(D - tg * (dot(tg, D)))); - } - else { -# ifdef __EMBREE__ - if (kernel_data.bvh.scene) { - sd->Ng = normalize(isect->Ng); - } - else +# if 0 + /* This approximates the position and geometric normal of a thick curve too, + * but gives too many issues with wrong self intersections. 
*/ + const float dPdu_radius = dPdu4.w; + sd->Ng = sd->N; + P += sd->N * dPdu_radius; # endif - { - /* direction from inside to surface of curve */ - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - sd->Ng = normalize(P - p_curr); - - /* adjustment for changing radius */ - float gd = isect->v; - - if (gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); - } - } - } - - /* todo: sometimes the normal is still so that this is detected as - * backfacing even if cull backfaces is enabled */ - - sd->N = sd->Ng; } else { - float4 P_curve[2]; - - if (sd->type & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, k0); - P_curve[1] = kernel_tex_fetch(__curve_keys, k1); - } - else { - motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); - } - - float l = 1.0f; - tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); - - P = P + D * t; - - float3 dif = P - float4_to_float3(P_curve[0]); - -# ifdef __UV__ - sd->u = dot(dif, tg) / l; - sd->v = 0.0f; -# endif - - if (flag & CURVE_KN_TRUETANGENTGNORMAL) { - sd->Ng = -(D - tg * dot(tg, D)); - sd->Ng = normalize(sd->Ng); - } - else { - float gd = isect->v; - - /* direction from inside to surface of curve */ - float denom = fmaxf(P_curve[0].w + sd->u * l * gd, 1e-8f); - sd->Ng = (dif - tg * sd->u * l) / denom; - - /* adjustment for changing radius */ - if (gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - } - - sd->Ng = normalize(sd->Ng); - } - + /* Thick curves, compute normal using direction from inside the curve. + * This could be optimized by recording the normal in the intersection, + * however for Optix this would go beyond the size of the payload. 
*/ + const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u)); + sd->Ng = normalize(P - P_inside); sd->N = sd->Ng; } # ifdef __DPDU__ /* dPdu/dPdv */ - sd->dPdu = tg; - sd->dPdv = cross(tg, sd->Ng); + sd->dPdu = dPdu; + sd->dPdv = cross(dPdu, sd->Ng); # endif if (isect->object != OBJECT_NONE) { @@ -867,7 +784,10 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg, P = transform_point(&tfm, P); } - return P; + sd->P = P; + + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + sd->shader = __float_as_int(curvedata.z); } #endif diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h index 0e2a00e9d2e..0f66f4af755 100644 --- a/intern/cycles/kernel/geom/geom_motion_curve.h +++ b/intern/cycles/kernel/geom/geom_motion_curve.h @@ -50,14 +50,14 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z; } -ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, - int offset, - int numkeys, - int numsteps, - int step, - int k0, - int k1, - float4 keys[2]) +ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg, + int offset, + int numkeys, + int numsteps, + int step, + int k0, + int k1, + float4 keys[2]) { if (step == numsteps) { /* center step: regular key location */ @@ -77,7 +77,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, } /* return 2 curve key locations */ -ccl_device_inline void motion_curve_keys( +ccl_device_inline void motion_curve_keys_linear( KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2]) { /* get motion info */ @@ -97,24 +97,24 @@ ccl_device_inline void motion_curve_keys( /* fetch key coordinates */ float4 next_keys[2]; - motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, keys); - motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step + 1, k0, k1, 
next_keys); + motion_curve_keys_for_step_linear(kg, offset, numkeys, numsteps, step, k0, k1, keys); + motion_curve_keys_for_step_linear(kg, offset, numkeys, numsteps, step + 1, k0, k1, next_keys); /* interpolate between steps */ keys[0] = (1.0f - t) * keys[0] + t * next_keys[0]; keys[1] = (1.0f - t) * keys[1] + t * next_keys[1]; } -ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, - int offset, - int numkeys, - int numsteps, - int step, - int k0, - int k1, - int k2, - int k3, - float4 keys[4]) +ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, + int offset, + int numkeys, + int numsteps, + int step, + int k0, + int k1, + int k2, + int k3, + float4 keys[4]) { if (step == numsteps) { /* center step: regular key location */ @@ -138,15 +138,15 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, } /* return 2 curve key locations */ -ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, - int object, - int prim, - float time, - int k0, - int k1, - int k2, - int k3, - float4 keys[4]) +ccl_device_inline void motion_curve_keys(KernelGlobals *kg, + int object, + int prim, + float time, + int k0, + int k1, + int k2, + int k3, + float4 keys[4]) { /* get motion info */ int numsteps, numkeys; @@ -165,9 +165,8 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, /* fetch key coordinates */ float4 next_keys[4]; - motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys); - motion_cardinal_curve_keys_for_step( - kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys); + motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys); + motion_curve_keys_for_step(kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys); /* interpolate between steps */ keys[0] = (1.0f - t) * keys[0] + t * next_keys[0]; @@ -176,53 +175,6 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, 
keys[3] = (1.0f - t) * keys[3] + t * next_keys[3]; } -# if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) -/* Similar to above, but returns keys as pair of two AVX registers with each - * holding two float4. - */ -ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg, - int object, - int prim, - float time, - int k0, - int k1, - int k2, - int k3, - avxf *out_keys_0_1, - avxf *out_keys_2_3) -{ - /* Get motion info. */ - int numsteps, numkeys; - object_motion_info(kg, object, &numsteps, NULL, &numkeys); - - /* Figure out which steps we need to fetch and their interpolation factor. */ - int maxstep = numsteps * 2; - int step = min((int)(time * maxstep), maxstep - 1); - float t = time * maxstep - step; - - /* Find attribute. */ - AttributeElement elem; - int offset = find_attribute_curve_motion(kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); - kernel_assert(offset != ATTR_STD_NOT_FOUND); - - /* Fetch key coordinates. */ - float4 next_keys[4]; - float4 keys[4]; - motion_cardinal_curve_keys_for_step(kg, offset, numkeys, numsteps, step, k0, k1, k2, k3, keys); - motion_cardinal_curve_keys_for_step( - kg, offset, numkeys, numsteps, step + 1, k0, k1, k2, k3, next_keys); - - const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128); - const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128); - const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128); - const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128); - - /* Interpolate between steps. 
*/ - *out_keys_0_1 = (1.0f - t) * keys_0_1 + t * next_keys_0_1; - *out_keys_2_3 = (1.0f - t) * keys_2_3 + t * next_keys_2_3; -} -# endif - #endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 3aa68e1f84e..614e2e3b92b 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -411,25 +411,10 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) ccl_device_inline float3 bvh_clamp_direction(float3 dir) { - /* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse - * direction */ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__) - const ssef oopes(8.271806E-25f, 8.271806E-25f, 8.271806E-25f, 0.0f); - const ssef mask = _mm_cmpgt_ps(fabs(dir), oopes); - const ssef signdir = signmsk(dir.m128) | oopes; -# ifndef __KERNEL_AVX__ - ssef res = mask & ssef(dir); - res = _mm_or_ps(res, _mm_andnot_ps(mask, signdir)); -# else - ssef res = _mm_blendv_ps(signdir, dir, mask); -# endif - return float3(res); -#else /* __KERNEL_SSE__ && __KERNEL_SSE2__ */ const float ooeps = 8.271806E-25f; return make_float3((fabsf(dir.x) > ooeps) ? dir.x : copysignf(ooeps, dir.x), (fabsf(dir.y) > ooeps) ? dir.y : copysignf(ooeps, dir.y), (fabsf(dir.z) > ooeps) ? dir.z : copysignf(ooeps, dir.z)); -#endif /* __KERNEL_SSE__ && __KERNEL_SSE2__ */ } ccl_device_inline float3 bvh_inverse_direction(float3 dir) @@ -457,38 +442,6 @@ ccl_device_inline float bvh_instance_push( return t; } -#ifdef __QBVH__ -/* Same as above, but optimized for QBVH scene intersection, - * which needs to modify two max distances. - * - * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized - * so we can avoid having this duplication. 
- */ -ccl_device_inline void qbvh_instance_push(KernelGlobals *kg, - int object, - const Ray *ray, - float3 *P, - float3 *dir, - float3 *idir, - float *t, - float *t1) -{ - Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - - *P = transform_point(&tfm, ray->P); - - float len; - *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); - *idir = bvh_inverse_direction(*dir); - - if (*t != FLT_MAX) - *t *= len; - - if (*t1 != -FLT_MAX) - *t1 *= len; -} -#endif - /* Transorm ray to exit static object in BVH */ ccl_device_inline float bvh_instance_pop( @@ -551,39 +504,6 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, return t; } -# ifdef __QBVH__ -/* Same as above, but optimized for QBVH scene intersection, - * which needs to modify two max distances. - * - * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized - * so we can avoid having this duplication. - */ -ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, - int object, - const Ray *ray, - float3 *P, - float3 *dir, - float3 *idir, - float *t, - float *t1, - Transform *itfm) -{ - object_fetch_transform_motion_test(kg, object, ray->time, itfm); - - *P = transform_point(itfm, ray->P); - - float len; - *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len)); - *idir = bvh_inverse_direction(*dir); - - if (*t != FLT_MAX) - *t *= len; - - if (*t1 != -FLT_MAX) - *t1 *= len; -} -# endif - /* Transorm ray to exit motion blurred object in BVH */ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 9a91da79f58..997abf438d0 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -174,6 +174,11 @@ ccl_device_inline float4 primitive_attribute_float4(KernelGlobals *kg, else return subd_triangle_attribute_float4(kg, sd, 
desc, dx, dy); } +#ifdef __HAIR__ + else if (sd->type & PRIMITIVE_ALL_CURVE) { + return curve_attribute_float4(kg, sd, desc, dx, dy); + } +#endif else { if (dx) *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index a2731bf2bd0..0278f3ade8e 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -312,12 +312,21 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals *kg, float4 *dx, float4 *dy) { - if (desc.element == ATTR_ELEMENT_CORNER_BYTE) { - int tri = desc.offset + sd->prim * 3; - - float4 f0 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 0)); - float4 f1 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 1)); - float4 f2 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 2)); + if (desc.element == ATTR_ELEMENT_CORNER_BYTE || desc.element == ATTR_ELEMENT_VERTEX) { + float4 f0, f1, f2; + + if (desc.element == ATTR_ELEMENT_CORNER_BYTE) { + int tri = desc.offset + sd->prim * 3; + f0 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 0)); + f1 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 1)); + f2 = color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 2)); + } + else { + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x); + f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y); + f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z); + } #ifdef __RAY_DIFFERENTIALS__ if (dx) diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index 6604806f73b..b0cce274b94 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -71,433 +71,6 @@ ccl_device_inline bool 
triangle_intersect(KernelGlobals *kg, return false; } -#ifdef __KERNEL_AVX2__ -# define cross256(A, B, C, D) _mm256_fmsub_ps(A, B, _mm256_mul_ps(C, D)) -ccl_device_inline int ray_triangle_intersect8(KernelGlobals *kg, - float3 ray_P, - float3 ray_dir, - Intersection **isect, - uint visibility, - int object, - __m256 *triA, - __m256 *triB, - __m256 *triC, - int prim_addr, - int prim_num, - uint *num_hits, - uint max_hits, - int *num_hits_in_instance, - float isect_t) -{ - - const unsigned char prim_num_mask = (1 << prim_num) - 1; - - const __m256i zero256 = _mm256_setzero_si256(); - - const __m256 Px256 = _mm256_set1_ps(ray_P.x); - const __m256 Py256 = _mm256_set1_ps(ray_P.y); - const __m256 Pz256 = _mm256_set1_ps(ray_P.z); - - const __m256 dirx256 = _mm256_set1_ps(ray_dir.x); - const __m256 diry256 = _mm256_set1_ps(ray_dir.y); - const __m256 dirz256 = _mm256_set1_ps(ray_dir.z); - - /* Calculate vertices relative to ray origin. */ - __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256); - __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256); - __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256); - - __m256 v1_x_256 = _mm256_sub_ps(triA[0], Px256); - __m256 v1_y_256 = _mm256_sub_ps(triA[1], Py256); - __m256 v1_z_256 = _mm256_sub_ps(triA[2], Pz256); - - __m256 v2_x_256 = _mm256_sub_ps(triB[0], Px256); - __m256 v2_y_256 = _mm256_sub_ps(triB[1], Py256); - __m256 v2_z_256 = _mm256_sub_ps(triB[2], Pz256); - - __m256 v0_v1_x_256 = _mm256_add_ps(v0_x_256, v1_x_256); - __m256 v0_v1_y_256 = _mm256_add_ps(v0_y_256, v1_y_256); - __m256 v0_v1_z_256 = _mm256_add_ps(v0_z_256, v1_z_256); - - __m256 v0_v2_x_256 = _mm256_add_ps(v0_x_256, v2_x_256); - __m256 v0_v2_y_256 = _mm256_add_ps(v0_y_256, v2_y_256); - __m256 v0_v2_z_256 = _mm256_add_ps(v0_z_256, v2_z_256); - - __m256 v1_v2_x_256 = _mm256_add_ps(v1_x_256, v2_x_256); - __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256); - __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256); - - /* Calculate triangle edges. 
*/ - __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256); - __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256); - __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256); - - __m256 e1_x_256 = _mm256_sub_ps(v0_x_256, v1_x_256); - __m256 e1_y_256 = _mm256_sub_ps(v0_y_256, v1_y_256); - __m256 e1_z_256 = _mm256_sub_ps(v0_z_256, v1_z_256); - - __m256 e2_x_256 = _mm256_sub_ps(v1_x_256, v2_x_256); - __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256); - __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256); - - /* Perform edge tests. */ - /* cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx) */ - __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256); - __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256); - __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256); - /* vertical dot */ - __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256); - U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); - U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); - - __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256); - __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256); - __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256); - /* vertical dot */ - __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256); - V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256); - V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256); - - __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256); - __m256 W_y_256 = cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256); - __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256); - /* vertical dot */ - __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256); - W_256 = _mm256_fmadd_ps(W_y_256, diry256, W_256); - W_256 = _mm256_fmadd_ps(W_z_256, dirz256, W_256); - - __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31); - __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31); - __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 
31); - __m256i UVW_256_1 = _mm256_add_epi32(_mm256_add_epi32(U_256_1, V_256_1), W_256_1); - - const __m256i one256 = _mm256_set1_epi32(1); - const __m256i two256 = _mm256_set1_epi32(2); - - __m256i mask_minmaxUVW_256 = _mm256_or_si256(_mm256_cmpeq_epi32(one256, UVW_256_1), - _mm256_cmpeq_epi32(two256, UVW_256_1)); - - unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256)); - if ((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { // all bits set - return false; - } - - /* Calculate geometry normal and denominator. */ - __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256); - __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256); - __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256); - - Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256); - Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256); - Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256); - - /* vertical dot */ - __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256); - den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256, den_256); - den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256, den_256); - - /* Perform depth test. 
*/ - __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256); - T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256, T_256); - T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256, T_256); - - const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000); - __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000); - - __m256 sign_T_256 = _mm256_castsi256_ps( - _mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256)); - - unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256); - if (((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) { - return false; - } - - __m256 xor_signmask_256 = _mm256_castsi256_ps( - _mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)); - - ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8]; - ccl_align(32) unsigned int mask_minmaxUVW8[8]; - - if (visibility == PATH_RAY_SHADOW_OPAQUE) { - __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256); - __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); - __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256)); - __m256 rayt_256 = _mm256_set1_ps((*isect)->t); - __m256i mask1 = _mm256_cmpgt_epi32( - _mm256_castps_si256(sign_T_256), - _mm256_castps_si256(_mm256_mul_ps( - _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), - rayt_256))); - mask0 = _mm256_or_si256(mask1, mask0); - mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) - mask_final_256 = _mm256_andnot_si256( - maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden) - int mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256)); - if ((mask_final & prim_num_mask) == 0) { - return false; - } - while (mask_final != 0) { - const int i = __bscf(mask_final); - if (i >= prim_num) { - return false; - } -# ifdef __VISIBILITY_FLAG__ - if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) { - 
continue; - } -# endif - __m256 inv_den_256 = _mm256_rcp_ps(den_256); - U_256 = _mm256_mul_ps(U_256, inv_den_256); - V_256 = _mm256_mul_ps(V_256, inv_den_256); - T_256 = _mm256_mul_ps(T_256, inv_den_256); - _mm256_store_ps(U8, U_256); - _mm256_store_ps(V8, V_256); - _mm256_store_ps(T8, T_256); - (*isect)->u = U8[i]; - (*isect)->v = V8[i]; - (*isect)->t = T8[i]; - (*isect)->prim = (prim_addr + i); - (*isect)->object = object; - (*isect)->type = PRIMITIVE_TRIANGLE; - return true; - } - return false; - } - else { - _mm256_store_ps(den8, den_256); - _mm256_store_ps(U8, U_256); - _mm256_store_ps(V8, V_256); - _mm256_store_ps(T8, T_256); - - _mm256_store_ps(sign_T8, sign_T_256); - _mm256_store_ps(xor_signmask8, xor_signmask_256); - _mm256_store_si256((__m256i *)mask_minmaxUVW8, mask_minmaxUVW_256); - - int ret = false; - - if (visibility == PATH_RAY_SHADOW) { - for (int i = 0; i < prim_num; i++) { - if (mask_minmaxUVW8[i]) { - continue; - } -# ifdef __VISIBILITY_FLAG__ - if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) { - continue; - } -# endif - if ((sign_T8[i] < 0.0f) || (sign_T8[i] > (*isect)->t * xor_signmask8[i])) { - continue; - } - if (!den8[i]) { - continue; - } - const float inv_den = 1.0f / den8[i]; - (*isect)->u = U8[i] * inv_den; - (*isect)->v = V8[i] * inv_den; - (*isect)->t = T8[i] * inv_den; - (*isect)->prim = (prim_addr + i); - (*isect)->object = object; - (*isect)->type = PRIMITIVE_TRIANGLE; - const int prim = kernel_tex_fetch(__prim_index, (*isect)->prim); - int shader = 0; -# ifdef __HAIR__ - if (kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE) -# endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } -# ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } -# endif - const int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - /* If no transparent shadows, all light is blocked. 
*/ - if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return 2; - } - /* If maximum number of hits reached, block all light. */ - else if (num_hits == NULL || *num_hits == max_hits) { - return 2; - } - /* Move on to next entry in intersections array. */ - ret = true; - (*isect)++; - (*num_hits)++; - (*num_hits_in_instance)++; - (*isect)->t = isect_t; - } - } - else { - for (int i = 0; i < prim_num; i++) { - if (mask_minmaxUVW8[i]) { - continue; - } -# ifdef __VISIBILITY_FLAG__ - if ((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) { - continue; - } -# endif - if ((sign_T8[i] < 0.0f) || (sign_T8[i] > (*isect)->t * xor_signmask8[i])) { - continue; - } - if (!den8[i]) { - continue; - } - const float inv_den = 1.0f / den8[i]; - (*isect)->u = U8[i] * inv_den; - (*isect)->v = V8[i] * inv_den; - (*isect)->t = T8[i] * inv_den; - (*isect)->prim = (prim_addr + i); - (*isect)->object = object; - (*isect)->type = PRIMITIVE_TRIANGLE; - ret = true; - } - } - return ret; - } -} - -ccl_device_inline int triangle_intersect8(KernelGlobals *kg, - Intersection **isect, - float3 P, - float3 dir, - uint visibility, - int object, - int prim_addr, - int prim_num, - uint *num_hits, - uint max_hits, - int *num_hits_in_instance, - float isect_t) -{ - __m128 tri_a[8], tri_b[8], tri_c[8]; - __m256 tritmp[12], tri[12]; - __m256 triA[3], triB[3], triC[3]; - - int i, r; - - uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); - for (i = 0; i < prim_num; i++) { - tri_a[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++]; - tri_b[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++]; - tri_c[i] = *(__m128 *)&kg->__prim_tri_verts.data[tri_vindex++]; - } - // create 9 or 12 placeholders - tri[0] = _mm256_castps128_ps256(tri_a[0]); //_mm256_zextps128_ps256 - tri[1] = _mm256_castps128_ps256(tri_b[0]); //_mm256_zextps128_ps256 - tri[2] = _mm256_castps128_ps256(tri_c[0]); //_mm256_zextps128_ps256 - - tri[3] = _mm256_castps128_ps256(tri_a[1]); 
//_mm256_zextps128_ps256 - tri[4] = _mm256_castps128_ps256(tri_b[1]); //_mm256_zextps128_ps256 - tri[5] = _mm256_castps128_ps256(tri_c[1]); //_mm256_zextps128_ps256 - - tri[6] = _mm256_castps128_ps256(tri_a[2]); //_mm256_zextps128_ps256 - tri[7] = _mm256_castps128_ps256(tri_b[2]); //_mm256_zextps128_ps256 - tri[8] = _mm256_castps128_ps256(tri_c[2]); //_mm256_zextps128_ps256 - - if (prim_num > 3) { - tri[9] = _mm256_castps128_ps256(tri_a[3]); //_mm256_zextps128_ps256 - tri[10] = _mm256_castps128_ps256(tri_b[3]); //_mm256_zextps128_ps256 - tri[11] = _mm256_castps128_ps256(tri_c[3]); //_mm256_zextps128_ps256 - } - - for (i = 4, r = 0; i < prim_num; i++, r += 3) { - tri[r] = _mm256_insertf128_ps(tri[r], tri_a[i], 1); - tri[r + 1] = _mm256_insertf128_ps(tri[r + 1], tri_b[i], 1); - tri[r + 2] = _mm256_insertf128_ps(tri[r + 2], tri_c[i], 1); - } - - //------------------------------------------------ - // 0! Xa0 Ya0 Za0 1 Xa4 Ya4 Za4 1 - // 1! Xb0 Yb0 Zb0 1 Xb4 Yb4 Zb4 1 - // 2! Xc0 Yc0 Zc0 1 Xc4 Yc4 Zc4 1 - - // 3! Xa1 Ya1 Za1 1 Xa5 Ya5 Za5 1 - // 4! Xb1 Yb1 Zb1 1 Xb5 Yb5 Zb5 1 - // 5! Xc1 Yc1 Zc1 1 Xc5 Yc5 Zc5 1 - - // 6! Xa2 Ya2 Za2 1 Xa6 Ya6 Za6 1 - // 7! Xb2 Yb2 Zb2 1 Xb6 Yb6 Zb6 1 - // 8! Xc2 Yc2 Zc2 1 Xc6 Yc6 Zc6 1 - - // 9! Xa3 Ya3 Za3 1 Xa7 Ya7 Za7 1 - // 10! Xb3 Yb3 Zb3 1 Xb7 Yb7 Zb7 1 - // 11! Xc3 Yc3 Zc3 1 Xc7 Yc7 Zc7 1 - - //"transpose" - tritmp[0] = _mm256_unpacklo_ps(tri[0], tri[3]); // 0! Xa0 Xa1 Ya0 Ya1 Xa4 Xa5 Ya4 Ya5 - tritmp[1] = _mm256_unpackhi_ps(tri[0], tri[3]); // 1! Za0 Za1 1 1 Za4 Za5 1 1 - - tritmp[2] = _mm256_unpacklo_ps(tri[6], tri[9]); // 2! Xa2 Xa3 Ya2 Ya3 Xa6 Xa7 Ya6 Ya7 - tritmp[3] = _mm256_unpackhi_ps(tri[6], tri[9]); // 3! Za2 Za3 1 1 Za6 Za7 1 1 - - tritmp[4] = _mm256_unpacklo_ps(tri[1], tri[4]); // 4! Xb0 Xb1 Yb0 Yb1 Xb4 Xb5 Yb4 Yb5 - tritmp[5] = _mm256_unpackhi_ps(tri[1], tri[4]); // 5! Zb0 Zb1 1 1 Zb4 Zb5 1 1 - - tritmp[6] = _mm256_unpacklo_ps(tri[7], tri[10]); // 6! 
Xb2 Xb3 Yb2 Yb3 Xb6 Xb7 Yb6 Yb7 - tritmp[7] = _mm256_unpackhi_ps(tri[7], tri[10]); // 7! Zb2 Zb3 1 1 Zb6 Zb7 1 1 - - tritmp[8] = _mm256_unpacklo_ps(tri[2], tri[5]); // 8! Xc0 Xc1 Yc0 Yc1 Xc4 Xc5 Yc4 Yc5 - tritmp[9] = _mm256_unpackhi_ps(tri[2], tri[5]); // 9! Zc0 Zc1 1 1 Zc4 Zc5 1 1 - - tritmp[10] = _mm256_unpacklo_ps(tri[8], tri[11]); // 10! Xc2 Xc3 Yc2 Yc3 Xc6 Xc7 Yc6 Yc7 - tritmp[11] = _mm256_unpackhi_ps(tri[8], tri[11]); // 11! Zc2 Zc3 1 1 Zc6 Zc7 1 1 - - /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ - triA[0] = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[0]), - _mm256_castps_pd(tritmp[2]))); // Xa0 Xa1 Xa2 Xa3 Xa4 Xa5 Xa6 Xa7 - triA[1] = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[0]), - _mm256_castps_pd(tritmp[2]))); // Ya0 Ya1 Ya2 Ya3 Ya4 Ya5 Ya6 Ya7 - triA[2] = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[1]), - _mm256_castps_pd(tritmp[3]))); // Za0 Za1 Za2 Za3 Za4 Za5 Za6 Za7 - - triB[0] = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[4]), - _mm256_castps_pd(tritmp[6]))); // Xb0 Xb1 Xb2 Xb3 Xb4 Xb5 Xb5 Xb7 - triB[1] = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[4]), - _mm256_castps_pd(tritmp[6]))); // Yb0 Yb1 Yb2 Yb3 Yb4 Yb5 Yb5 Yb7 - triB[2] = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[5]), - _mm256_castps_pd(tritmp[7]))); // Zb0 Zb1 Zb2 Zb3 Zb4 Zb5 Zb5 Zb7 - - triC[0] = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[8]), - _mm256_castps_pd(tritmp[10]))); // Xc0 Xc1 Xc2 Xc3 Xc4 Xc5 Xc6 Xc7 - triC[1] = _mm256_castpd_ps( - _mm256_unpackhi_pd(_mm256_castps_pd(tritmp[8]), - _mm256_castps_pd(tritmp[10]))); // Yc0 Yc1 Yc2 Yc3 Yc4 Yc5 Yc6 Yc7 - triC[2] = _mm256_castpd_ps( - _mm256_unpacklo_pd(_mm256_castps_pd(tritmp[9]), - _mm256_castps_pd(tritmp[11]))); // Zc0 Zc1 Zc2 Zc3 Zc4 Zc5 Zc6 Zc7 - - /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/ - - int result = ray_triangle_intersect8(kg, - P, - 
dir, - isect, - visibility, - object, - triA, - triB, - triC, - prim_addr, - prim_num, - num_hits, - max_hits, - num_hits_in_instance, - isect_t); - return result; -} - -#endif /* __KERNEL_AVX2__ */ - /* Special ray intersection routines for subsurface scattering. In that case we * only want to intersect with primitives in the same object, and if case of * multiple hits we pick a single random primitive as the intersection point. diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 71b176a0a8f..4ac07d86dda 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -326,9 +326,7 @@ ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg, /* Background MIS weights. */ # ifdef __BACKGROUND_MIS__ /* Check if background light exists or if we should skip pdf. */ - int res_x = kernel_data.integrator.pdf_background_res_x; - - if (!(state->flag & PATH_RAY_MIS_SKIP) && res_x) { + if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) { /* multiple importance sampling, get background light pdf for ray * direction, and compute weight with respect to BSDF pdf */ float pdf = background_light_pdf(kg, ray->P, ray->D); diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 04472212d0c..138b90373a6 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "kernel_light_background.h" + CCL_NAMESPACE_BEGIN /* Light Sample result */ @@ -33,500 +35,6 @@ typedef struct LightSample { LightType type; /* type of light */ } LightSample; -/* Area light sampling */ - -/* Uses the following paper: - * - * Carlos Urena et al. - * An Area-Preserving Parametrization for Spherical Rectangles. - * - * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf - * - * Note: light_p is modified when sample_coord is true. 
- */ -ccl_device_inline float rect_light_sample(float3 P, - float3 *light_p, - float3 axisu, - float3 axisv, - float randu, - float randv, - bool sample_coord) -{ - /* In our name system we're using P for the center, - * which is o in the paper. - */ - - float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; - float axisu_len, axisv_len; - /* Compute local reference system R. */ - float3 x = normalize_len(axisu, &axisu_len); - float3 y = normalize_len(axisv, &axisv_len); - float3 z = cross(x, y); - /* Compute rectangle coords in local reference system. */ - float3 dir = corner - P; - float z0 = dot(dir, z); - /* Flip 'z' to make it point against Q. */ - if (z0 > 0.0f) { - z *= -1.0f; - z0 *= -1.0f; - } - float x0 = dot(dir, x); - float y0 = dot(dir, y); - float x1 = x0 + axisu_len; - float y1 = y0 + axisv_len; - /* Compute internal angles (gamma_i). */ - float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1); - float4 nz = make_float4(y0, x1, y1, x0) * diff; - nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz); - float g0 = safe_acosf(-nz.x * nz.y); - float g1 = safe_acosf(-nz.y * nz.z); - float g2 = safe_acosf(-nz.z * nz.w); - float g3 = safe_acosf(-nz.w * nz.x); - /* Compute predefined constants. */ - float b0 = nz.x; - float b1 = nz.z; - float b0sq = b0 * b0; - float k = M_2PI_F - g2 - g3; - /* Compute solid angle from internal angles. */ - float S = g0 + g1 - k; - - if (sample_coord) { - /* Compute cu. */ - float au = randu * S + k; - float fu = (cosf(au) * b0 - b1) / sinf(au); - float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); - cu = clamp(cu, -1.0f, 1.0f); - /* Compute xu. */ - float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); - xu = clamp(xu, x0, x1); - /* Compute yv. 
*/ - float z0sq = z0 * z0; - float y0sq = y0 * y0; - float y1sq = y1 * y1; - float d = sqrtf(xu * xu + z0sq); - float h0 = y0 / sqrtf(d * d + y0sq); - float h1 = y1 / sqrtf(d * d + y1sq); - float hv = h0 + randv * (h1 - h0), hv2 = hv * hv; - float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1; - - /* Transform (xu, yv, z0) to world coords. */ - *light_p = P + xu * x + yv * y + z0 * z; - } - - /* return pdf */ - if (S != 0.0f) - return 1.0f / S; - else - return 0.0f; -} - -ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv) -{ - to_unit_disk(&randu, &randv); - return ru * randu + rv * randv; -} - -ccl_device float3 disk_light_sample(float3 v, float randu, float randv) -{ - float3 ru, rv; - - make_orthonormals(v, &ru, &rv); - - return ellipse_sample(ru, rv, randu, randv); -} - -ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv) -{ - return normalize(D + disk_light_sample(D, randu, randv) * radius); -} - -ccl_device float3 -sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv) -{ - return disk_light_sample(normalize(P - center), randu, randv) * radius; -} - -ccl_device float spot_light_attenuation(float3 dir, - float spot_angle, - float spot_smooth, - LightSample *ls) -{ - float3 I = ls->Ng; - - float attenuation = dot(dir, I); - - if (attenuation <= spot_angle) { - attenuation = 0.0f; - } - else { - float t = attenuation - spot_angle; - - if (t < spot_smooth && spot_smooth != 0.0f) - attenuation *= smoothstepf(t / spot_smooth); - } - - return attenuation; -} - -ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t) -{ - float cos_pi = dot(Ng, I); - - if (cos_pi <= 0.0f) - return 0.0f; - - return t * t / cos_pi; -} - -/* Background Light */ - -#ifdef __BACKGROUND_MIS__ - -ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) -{ - /* for the following, the CDF values are 
actually a pair of floats, with the - * function value as X and the actual CDF as Y. The last entry's function - * value is the CDF total. */ - int res_x = kernel_data.integrator.pdf_background_res_x; - int res_y = kernel_data.integrator.pdf_background_res_y; - int cdf_width = res_x + 1; - - /* this is basically std::lower_bound as used by pbrt */ - int first = 0; - int count = res_y; - - while (count > 0) { - int step = count >> 1; - int middle = first + step; - - if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) { - first = middle + 1; - count -= step + 1; - } - else - count = step; - } - - int index_v = max(0, first - 1); - kernel_assert(index_v >= 0 && index_v < res_y); - - float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v); - float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1); - float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y); - - /* importance-sampled V direction */ - float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv); - float v = (index_v + dv) / res_y; - - /* this is basically std::lower_bound as used by pbrt */ - first = 0; - count = res_x; - while (count > 0) { - int step = count >> 1; - int middle = first + step; - - if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y < - randu) { - first = middle + 1; - count -= step + 1; - } - else - count = step; - } - - int index_u = max(0, first - 1); - kernel_assert(index_u >= 0 && index_u < res_x); - - float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, - index_v * cdf_width + index_u); - float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf, - index_v * cdf_width + index_u + 1); - float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf, - index_v * cdf_width + res_x); - - /* importance-sampled U direction */ - float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu); - float u = (index_u + du) / res_x; - - /* compute pdf 
*/ - float sin_theta = sinf(M_PI_F * v); - float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x; - - if (sin_theta == 0.0f || denom == 0.0f) - *pdf = 0.0f; - else - *pdf = (cdf_u.x * cdf_v.x) / denom; - - /* compute direction */ - return equirectangular_to_direction(u, v); -} - -/* TODO(sergey): Same as above, after the release we should consider using - * 'noinline' for all devices. - */ -ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) -{ - float2 uv = direction_to_equirectangular(direction); - int res_x = kernel_data.integrator.pdf_background_res_x; - int res_y = kernel_data.integrator.pdf_background_res_y; - int cdf_width = res_x + 1; - - float sin_theta = sinf(uv.y * M_PI_F); - - if (sin_theta == 0.0f) - return 0.0f; - - int index_u = clamp(float_to_int(uv.x * res_x), 0, res_x - 1); - int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1); - - /* pdfs in V direction */ - float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf, - index_v * cdf_width + res_x); - float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y); - - float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x; - - if (denom == 0.0f) - return 0.0f; - - /* pdfs in U direction */ - float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, - index_v * cdf_width + index_u); - float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v); - - return (cdf_u.x * cdf_v.x) / denom; -} - -ccl_device_inline bool background_portal_data_fetch_and_check_side( - KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir) -{ - int portal = kernel_data.integrator.portal_offset + index; - const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); - - *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]); - *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); - - /* Check whether portal is on the right side. 
*/ - if (dot(*dir, P - *lightpos) > 1e-4f) - return true; - - return false; -} - -ccl_device_inline float background_portal_pdf( - KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible) -{ - float portal_pdf = 0.0f; - - int num_possible = 0; - for (int p = 0; p < kernel_data.integrator.num_portals; p++) { - if (p == ignore_portal) - continue; - - float3 lightpos, dir; - if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) - continue; - - /* There's a portal that could be sampled from this position. */ - if (is_possible) { - *is_possible = true; - } - num_possible++; - - int portal = kernel_data.integrator.portal_offset + p; - const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); - float3 axisu = make_float3( - klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); - float3 axisv = make_float3( - klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); - bool is_round = (klight->area.invarea < 0.0f); - - if (!ray_quad_intersect(P, - direction, - 1e-4f, - FLT_MAX, - lightpos, - axisu, - axisv, - dir, - NULL, - NULL, - NULL, - NULL, - is_round)) - continue; - - if (is_round) { - float t; - float3 D = normalize_len(lightpos - P, &t); - portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t); - } - else { - portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false); - } - } - - if (ignore_portal >= 0) { - /* We have skipped a portal that could be sampled as well. */ - num_possible++; - } - - return (num_possible > 0) ? 
portal_pdf / num_possible : 0.0f; -} - -ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) -{ - int num_possible_portals = 0; - for (int p = 0; p < kernel_data.integrator.num_portals; p++) { - float3 lightpos, dir; - if (background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) - num_possible_portals++; - } - return num_possible_portals; -} - -ccl_device float3 background_portal_sample(KernelGlobals *kg, - float3 P, - float randu, - float randv, - int num_possible, - int *sampled_portal, - float *pdf) -{ - /* Pick a portal, then re-normalize randv. */ - randv *= num_possible; - int portal = (int)randv; - randv -= portal; - - /* TODO(sergey): Some smarter way of finding portal to sample - * is welcome. - */ - for (int p = 0; p < kernel_data.integrator.num_portals; p++) { - /* Search for the sampled portal. */ - float3 lightpos, dir; - if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) - continue; - - if (portal == 0) { - /* p is the portal to be sampled. 
*/ - int portal = kernel_data.integrator.portal_offset + p; - const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); - float3 axisu = make_float3( - klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); - float3 axisv = make_float3( - klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); - bool is_round = (klight->area.invarea < 0.0f); - - float3 D; - if (is_round) { - lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv); - float t; - D = normalize_len(lightpos - P, &t); - *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t); - } - else { - *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true); - D = normalize(lightpos - P); - } - - *pdf /= num_possible; - *sampled_portal = p; - return D; - } - - portal--; - } - - return make_float3(0.0f, 0.0f, 0.0f); -} - -ccl_device_inline float3 -background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) -{ - /* Probability of sampling portals instead of the map. */ - float portal_sampling_pdf = kernel_data.integrator.portal_pdf; - - /* Check if there are portals in the scene which we can sample. */ - if (portal_sampling_pdf > 0.0f) { - int num_portals = background_num_possible_portals(kg, P); - if (num_portals > 0) { - if (portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) { - if (portal_sampling_pdf < 1.0f) { - randu /= portal_sampling_pdf; - } - int portal; - float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf); - if (num_portals > 1) { - /* Ignore the chosen portal, its pdf is already included. */ - *pdf += background_portal_pdf(kg, P, D, portal, NULL); - } - /* We could also have sampled the map, so combine with MIS. 
*/ - if (portal_sampling_pdf < 1.0f) { - float cdf_pdf = background_map_pdf(kg, D); - *pdf = (portal_sampling_pdf * (*pdf) + (1.0f - portal_sampling_pdf) * cdf_pdf); - } - return D; - } - else { - /* Sample map, but with nonzero portal_sampling_pdf for MIS. */ - randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf); - } - } - else { - /* We can't sample a portal. - * Check if we can sample the map instead. - */ - if (portal_sampling_pdf == 1.0f) { - /* Use uniform as a fallback if we can't sample the map. */ - *pdf = 1.0f / M_4PI_F; - return sample_uniform_sphere(randu, randv); - } - else { - portal_sampling_pdf = 0.0f; - } - } - } - - float3 D = background_map_sample(kg, randu, randv, pdf); - /* Use MIS if portals could be sampled as well. */ - if (portal_sampling_pdf > 0.0f) { - float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL); - *pdf = (portal_sampling_pdf * portal_pdf + (1.0f - portal_sampling_pdf) * (*pdf)); - } - return D; -} - -ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction) -{ - /* Probability of sampling portals instead of the map. */ - float portal_sampling_pdf = kernel_data.integrator.portal_pdf; - - float portal_pdf = 0.0f, map_pdf = 0.0f; - if (portal_sampling_pdf > 0.0f) { - /* Evaluate PDF of sampling this direction by portal sampling. */ - bool is_possible = false; - portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible) * portal_sampling_pdf; - if (!is_possible) { - /* Portal sampling is not possible here because all portals point to the wrong side. - * If map sampling is possible, it would be used instead, - * otherwise fallback sampling is used. */ - if (portal_sampling_pdf == 1.0f) { - return kernel_data.integrator.pdf_lights / M_4PI_F; - } - else { - /* Force map sampling. */ - portal_sampling_pdf = 0.0f; - } - } - } - if (portal_sampling_pdf < 1.0f) { - /* Evaluate PDF of sampling this direction by map sampling. 
*/ - map_pdf = background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf); - } - return (portal_pdf + map_pdf) * kernel_data.integrator.pdf_lights; -} -#endif - /* Regular Light */ ccl_device_inline bool lamp_light_sample( @@ -594,7 +102,7 @@ ccl_device_inline bool lamp_light_sample( /* spot light attenuation */ float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]); ls->eval_fac *= spot_light_attenuation( - dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls); + dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng); if (ls->eval_fac == 0.0f) { return false; } @@ -732,7 +240,7 @@ ccl_device bool lamp_light_eval( /* spot light attenuation */ float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]); ls->eval_fac *= spot_light_attenuation( - dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls); + dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng); if (ls->eval_fac == 0.0f) return false; @@ -805,20 +313,18 @@ ccl_device_inline bool triangle_world_space_vertices( triangle_vertices(kg, prim, V); } -#ifdef __INSTANCING__ if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { -# ifdef __OBJECT_MOTION__ +#ifdef __OBJECT_MOTION__ float object_time = (time >= 0.0f) ? 
time : 0.5f; Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL); -# else +#else Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); -# endif +#endif V[0] = transform_point(&tfm, V[0]); V[1] = transform_point(&tfm, V[1]); V[2] = transform_point(&tfm, V[2]); has_motion = true; } -#endif return has_motion; } diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h new file mode 100644 index 00000000000..30e336f0f80 --- /dev/null +++ b/intern/cycles/kernel/kernel_light_background.h @@ -0,0 +1,448 @@ +/* + * Copyright 2011-2020 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_light_common.h" + +CCL_NAMESPACE_BEGIN + +/* Background Light */ + +#ifdef __BACKGROUND_MIS__ + +ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf) +{ + /* for the following, the CDF values are actually a pair of floats, with the + * function value as X and the actual CDF as Y. The last entry's function + * value is the CDF total. 
*/ + int res_x = kernel_data.background.map_res_x; + int res_y = kernel_data.background.map_res_y; + int cdf_width = res_x + 1; + + /* this is basically std::lower_bound as used by pbrt */ + int first = 0; + int count = res_y; + + while (count > 0) { + int step = count >> 1; + int middle = first + step; + + if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) { + first = middle + 1; + count -= step + 1; + } + else + count = step; + } + + int index_v = max(0, first - 1); + kernel_assert(index_v >= 0 && index_v < res_y); + + float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v); + float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1); + float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y); + + /* importance-sampled V direction */ + float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv); + float v = (index_v + dv) / res_y; + + /* this is basically std::lower_bound as used by pbrt */ + first = 0; + count = res_x; + while (count > 0) { + int step = count >> 1; + int middle = first + step; + + if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y < + randu) { + first = middle + 1; + count -= step + 1; + } + else + count = step; + } + + int index_u = max(0, first - 1); + kernel_assert(index_u >= 0 && index_u < res_x); + + float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, + index_v * cdf_width + index_u); + float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf, + index_v * cdf_width + index_u + 1); + float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf, + index_v * cdf_width + res_x); + + /* importance-sampled U direction */ + float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu); + float u = (index_u + du) / res_x; + + /* compute pdf */ + float sin_theta = sinf(M_PI_F * v); + float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x; + + if (sin_theta == 0.0f || denom == 0.0f) 
+ *pdf = 0.0f; + else + *pdf = (cdf_u.x * cdf_v.x) / denom; + + /* compute direction */ + return equirectangular_to_direction(u, v); +} + +/* TODO(sergey): Same as above, after the release we should consider using + * 'noinline' for all devices. + */ +ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction) +{ + float2 uv = direction_to_equirectangular(direction); + int res_x = kernel_data.background.map_res_x; + int res_y = kernel_data.background.map_res_y; + int cdf_width = res_x + 1; + + float sin_theta = sinf(uv.y * M_PI_F); + + if (sin_theta == 0.0f) + return 0.0f; + + int index_u = clamp(float_to_int(uv.x * res_x), 0, res_x - 1); + int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1); + + /* pdfs in V direction */ + float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf, + index_v * cdf_width + res_x); + float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y); + + float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x; + + if (denom == 0.0f) + return 0.0f; + + /* pdfs in U direction */ + float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, + index_v * cdf_width + index_u); + float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v); + + return (cdf_u.x * cdf_v.x) / denom; +} + +ccl_device_inline bool background_portal_data_fetch_and_check_side( + KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir) +{ + int portal = kernel_data.background.portal_offset + index; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); + + *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]); + *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); + + /* Check whether portal is on the right side. 
*/ + if (dot(*dir, P - *lightpos) > 1e-4f) + return true; + + return false; +} + +ccl_device_inline float background_portal_pdf( + KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible) +{ + float portal_pdf = 0.0f; + + int num_possible = 0; + for (int p = 0; p < kernel_data.background.num_portals; p++) { + if (p == ignore_portal) + continue; + + float3 lightpos, dir; + if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) + continue; + + /* There's a portal that could be sampled from this position. */ + if (is_possible) { + *is_possible = true; + } + num_possible++; + + int portal = kernel_data.background.portal_offset + p; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); + float3 axisu = make_float3( + klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); + float3 axisv = make_float3( + klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); + bool is_round = (klight->area.invarea < 0.0f); + + if (!ray_quad_intersect(P, + direction, + 1e-4f, + FLT_MAX, + lightpos, + axisu, + axisv, + dir, + NULL, + NULL, + NULL, + NULL, + is_round)) + continue; + + if (is_round) { + float t; + float3 D = normalize_len(lightpos - P, &t); + portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t); + } + else { + portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false); + } + } + + if (ignore_portal >= 0) { + /* We have skipped a portal that could be sampled as well. */ + num_possible++; + } + + return (num_possible > 0) ? 
portal_pdf / num_possible : 0.0f; +} + +ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P) +{ + int num_possible_portals = 0; + for (int p = 0; p < kernel_data.background.num_portals; p++) { + float3 lightpos, dir; + if (background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) + num_possible_portals++; + } + return num_possible_portals; +} + +ccl_device float3 background_portal_sample(KernelGlobals *kg, + float3 P, + float randu, + float randv, + int num_possible, + int *sampled_portal, + float *pdf) +{ + /* Pick a portal, then re-normalize randv. */ + randv *= num_possible; + int portal = (int)randv; + randv -= portal; + + /* TODO(sergey): Some smarter way of finding portal to sample + * is welcome. + */ + for (int p = 0; p < kernel_data.background.num_portals; p++) { + /* Search for the sampled portal. */ + float3 lightpos, dir; + if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)) + continue; + + if (portal == 0) { + /* p is the portal to be sampled. 
*/ + int portal = kernel_data.background.portal_offset + p; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); + float3 axisu = make_float3( + klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); + float3 axisv = make_float3( + klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); + bool is_round = (klight->area.invarea < 0.0f); + + float3 D; + if (is_round) { + lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv); + float t; + D = normalize_len(lightpos - P, &t); + *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t); + } + else { + *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true); + D = normalize(lightpos - P); + } + + *pdf /= num_possible; + *sampled_portal = p; + return D; + } + + portal--; + } + + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device_inline float3 background_sun_sample(KernelGlobals *kg, + float randu, + float randv, + float *pdf) +{ + float3 D; + const float3 N = float4_to_float3(kernel_data.background.sun); + const float angle = kernel_data.background.sun.w; + sample_uniform_cone(N, angle, randu, randv, &D, pdf); + return D; +} + +ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D) +{ + const float3 N = float4_to_float3(kernel_data.background.sun); + const float angle = kernel_data.background.sun.w; + return pdf_uniform_cone(N, D, angle); +} + +ccl_device_inline float3 +background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf) +{ + float portal_method_pdf = kernel_data.background.portal_weight; + float sun_method_pdf = kernel_data.background.sun_weight; + float map_method_pdf = kernel_data.background.map_weight; + + int num_portals = 0; + if (portal_method_pdf > 0.0f) { + /* Check if there are portals in the scene which we can sample. 
*/ + num_portals = background_num_possible_portals(kg, P); + if (num_portals == 0) { + portal_method_pdf = 0.0f; + } + } + + float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf); + if (pdf_fac == 0.0f) { + /* Use uniform as a fallback if we can't use any strategy. */ + *pdf = 1.0f / M_4PI_F; + return sample_uniform_sphere(randu, randv); + } + + pdf_fac = 1.0f / pdf_fac; + portal_method_pdf *= pdf_fac; + sun_method_pdf *= pdf_fac; + map_method_pdf *= pdf_fac; + + /* We have 100% in total and split it between the three categories. + * Therefore, we pick portals if randu is between 0 and portal_method_pdf, + * sun if randu is between portal_method_pdf and (portal_method_pdf + sun_method_pdf) + * and map if randu is between (portal_method_pdf + sun_method_pdf) and 1. */ + float sun_method_cdf = portal_method_pdf + sun_method_pdf; + + int method = 0; + float3 D; + if (randu < portal_method_pdf) { + method = 0; + /* Rescale randu. */ + if (portal_method_pdf != 1.0f) { + randu /= portal_method_pdf; + } + + /* Sample a portal. */ + int portal; + D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf); + if (num_portals > 1) { + /* Ignore the chosen portal, its pdf is already included. */ + *pdf += background_portal_pdf(kg, P, D, portal, NULL); + } + + /* Skip MIS if this is the only method. */ + if (portal_method_pdf == 1.0f) { + return D; + } + *pdf *= portal_method_pdf; + } + else if (randu < sun_method_cdf) { + method = 1; + /* Rescale randu. */ + if (sun_method_pdf != 1.0f) { + randu = (randu - portal_method_pdf) / sun_method_pdf; + } + + D = background_sun_sample(kg, randu, randv, pdf); + + /* Skip MIS if this is the only method. */ + if (sun_method_pdf == 1.0f) { + return D; + } + *pdf *= sun_method_pdf; + } + else { + method = 2; + /* Rescale randu. 
*/ + if (map_method_pdf != 1.0f) { + randu = (randu - sun_method_cdf) / map_method_pdf; + } + + D = background_map_sample(kg, randu, randv, pdf); + + /* Skip MIS if this is the only method. */ + if (map_method_pdf == 1.0f) { + return D; + } + *pdf *= map_method_pdf; + } + + /* MIS weighting. */ + if (method != 0 && portal_method_pdf != 0.0f) { + *pdf += portal_method_pdf * background_portal_pdf(kg, P, D, -1, NULL); + } + if (method != 1 && sun_method_pdf != 0.0f) { + *pdf += sun_method_pdf * background_sun_pdf(kg, D); + } + if (method != 2 && map_method_pdf != 0.0f) { + *pdf += map_method_pdf * background_map_pdf(kg, D); + } + return D; +} + +ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction) +{ + float portal_method_pdf = kernel_data.background.portal_weight; + float sun_method_pdf = kernel_data.background.sun_weight; + float map_method_pdf = kernel_data.background.map_weight; + + float portal_pdf = 0.0f; + /* Portals are a special case here since we need to compute their pdf in order + * to find out if we can sample them. */ + if (portal_method_pdf > 0.0f) { + /* Evaluate PDF of sampling this direction by portal sampling. */ + bool is_possible = false; + portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible); + if (!is_possible) { + /* Portal sampling is not possible here because all portals point to the wrong side. + * If other methods can be used instead, do so, otherwise uniform sampling is used as a + * fallback. */ + portal_method_pdf = 0.0f; + } + } + + float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf); + if (pdf_fac == 0.0f) { + /* Use uniform as a fallback if we can't use any strategy. 
*/ + return kernel_data.integrator.pdf_lights / M_4PI_F; + } + + pdf_fac = 1.0f / pdf_fac; + portal_method_pdf *= pdf_fac; + sun_method_pdf *= pdf_fac; + map_method_pdf *= pdf_fac; + + float pdf = portal_pdf * portal_method_pdf; + if (sun_method_pdf != 0.0f) { + pdf += background_sun_pdf(kg, direction) * sun_method_pdf; + } + if (map_method_pdf != 0.0f) { + pdf += background_map_pdf(kg, direction) * map_method_pdf; + } + + return pdf * kernel_data.integrator.pdf_lights; +} + +#endif + +CCL_NAMESPACE_END
\ No newline at end of file diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h new file mode 100644 index 00000000000..39503a4b479 --- /dev/null +++ b/intern/cycles/kernel/kernel_light_common.h @@ -0,0 +1,159 @@ +/* + * Copyright 2011-2020 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Area light sampling */ + +/* Uses the following paper: + * + * Carlos Urena et al. + * An Area-Preserving Parametrization for Spherical Rectangles. + * + * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf + * + * Note: light_p is modified when sample_coord is true. + */ +ccl_device_inline float rect_light_sample(float3 P, + float3 *light_p, + float3 axisu, + float3 axisv, + float randu, + float randv, + bool sample_coord) +{ + /* In our name system we're using P for the center, + * which is o in the paper. + */ + + float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f; + float axisu_len, axisv_len; + /* Compute local reference system R. */ + float3 x = normalize_len(axisu, &axisu_len); + float3 y = normalize_len(axisv, &axisv_len); + float3 z = cross(x, y); + /* Compute rectangle coords in local reference system. */ + float3 dir = corner - P; + float z0 = dot(dir, z); + /* Flip 'z' to make it point against Q. 
*/ + if (z0 > 0.0f) { + z *= -1.0f; + z0 *= -1.0f; + } + float x0 = dot(dir, x); + float y0 = dot(dir, y); + float x1 = x0 + axisu_len; + float y1 = y0 + axisv_len; + /* Compute internal angles (gamma_i). */ + float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1); + float4 nz = make_float4(y0, x1, y1, x0) * diff; + nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz); + float g0 = safe_acosf(-nz.x * nz.y); + float g1 = safe_acosf(-nz.y * nz.z); + float g2 = safe_acosf(-nz.z * nz.w); + float g3 = safe_acosf(-nz.w * nz.x); + /* Compute predefined constants. */ + float b0 = nz.x; + float b1 = nz.z; + float b0sq = b0 * b0; + float k = M_2PI_F - g2 - g3; + /* Compute solid angle from internal angles. */ + float S = g0 + g1 - k; + + if (sample_coord) { + /* Compute cu. */ + float au = randu * S + k; + float fu = (cosf(au) * b0 - b1) / sinf(au); + float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); + cu = clamp(cu, -1.0f, 1.0f); + /* Compute xu. */ + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); + xu = clamp(xu, x0, x1); + /* Compute yv. */ + float z0sq = z0 * z0; + float y0sq = y0 * y0; + float y1sq = y1 * y1; + float d = sqrtf(xu * xu + z0sq); + float h0 = y0 / sqrtf(d * d + y0sq); + float h1 = y1 / sqrtf(d * d + y1sq); + float hv = h0 + randv * (h1 - h0), hv2 = hv * hv; + float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1; + + /* Transform (xu, yv, z0) to world coords. 
*/ + *light_p = P + xu * x + yv * y + z0 * z; + } + + /* return pdf */ + if (S != 0.0f) + return 1.0f / S; + else + return 0.0f; +} + +ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv) +{ + to_unit_disk(&randu, &randv); + return ru * randu + rv * randv; +} + +ccl_device float3 disk_light_sample(float3 v, float randu, float randv) +{ + float3 ru, rv; + + make_orthonormals(v, &ru, &rv); + + return ellipse_sample(ru, rv, randu, randv); +} + +ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv) +{ + return normalize(D + disk_light_sample(D, randu, randv) * radius); +} + +ccl_device float3 +sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv) +{ + return disk_light_sample(normalize(P - center), randu, randv) * radius; +} + +ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, float3 N) +{ + float attenuation = dot(dir, N); + + if (attenuation <= spot_angle) { + attenuation = 0.0f; + } + else { + float t = attenuation - spot_angle; + + if (t < spot_smooth && spot_smooth != 0.0f) + attenuation *= smoothstepf(t / spot_smooth); + } + + return attenuation; +} + +ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t) +{ + float cos_pi = dot(Ng, I); + + if (cos_pi <= 0.0f) + return 0.0f; + + return t * t / cos_pi; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h index 5c776e06547..0edcc1a5a14 100644 --- a/intern/cycles/kernel/kernel_montecarlo.h +++ b/intern/cycles/kernel/kernel_montecarlo.h @@ -98,6 +98,16 @@ ccl_device_inline void sample_uniform_cone( *pdf = M_1_2PI_F / (1.0f - zMin); } +ccl_device_inline float pdf_uniform_cone(const float3 N, float3 D, float angle) +{ + float zMin = cosf(angle); + float z = dot(N, D); + if (z > zMin) { + return M_1_2PI_F / (1.0f - zMin); + } + return 0.0f; +} + /* sample uniform point on the 
surface of a sphere */ ccl_device float3 sample_uniform_sphere(float u1, float u2) { diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 9700aaba80f..3d9f787f267 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -63,10 +63,8 @@ ccl_device_noinline { PROFILING_INIT(kg, PROFILING_SHADER_SETUP); -#ifdef __INSTANCING__ sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) : isect->object; -#endif sd->lamp = LAMP_NONE; sd->type = isect->type; @@ -82,18 +80,13 @@ ccl_device_noinline sd->prim = kernel_tex_fetch(__prim_index, isect->prim); sd->ray_length = isect->t; -#ifdef __UV__ sd->u = isect->u; sd->v = isect->v; -#endif #ifdef __HAIR__ if (sd->type & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - - sd->shader = __float_as_int(curvedata.z); - sd->P = curve_refine(kg, sd, isect, ray); + curve_shader_setup(kg, sd, isect, ray); } else #endif @@ -125,17 +118,15 @@ ccl_device_noinline sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; -#ifdef __INSTANCING__ if (isect->object != OBJECT_NONE) { /* instance transform */ object_normal_transform_auto(kg, sd, &sd->N); object_normal_transform_auto(kg, sd, &sd->Ng); -# ifdef __DPDU__ +#ifdef __DPDU__ object_dir_transform_auto(kg, sd, &sd->dPdu); object_dir_transform_auto(kg, sd, &sd->dPdv); -# endif - } #endif + } /* backfacing test */ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); @@ -185,10 +176,8 @@ ccl_device_inline sd->prim = kernel_tex_fetch(__prim_index, isect->prim); sd->type = isect->type; -# ifdef __UV__ sd->u = isect->u; sd->v = isect->v; -# endif /* fetch triangle data */ if (sd->type == PRIMITIVE_TRIANGLE) { @@ -215,17 +204,15 @@ ccl_device_inline sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; -# ifdef __INSTANCING__ if (isect->object != OBJECT_NONE) { /* instance transform */ 
object_normal_transform_auto(kg, sd, &sd->N); object_normal_transform_auto(kg, sd, &sd->Ng); -# ifdef __DPDU__ +# ifdef __DPDU__ object_dir_transform_auto(kg, sd, &sd->dPdu); object_dir_transform_auto(kg, sd, &sd->dPdv); -# endif - } # endif + } /* backfacing test */ if (backfacing) { @@ -284,17 +271,13 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, else sd->type = PRIMITIVE_NONE; - /* primitive */ -#ifdef __INSTANCING__ + /* primitive */ sd->object = object; -#endif sd->lamp = LAMP_NONE; /* currently no access to bvh prim index for strand sd->prim*/ sd->prim = prim; -#ifdef __UV__ sd->u = u; sd->v = v; -#endif sd->time = time; sd->ray_length = t; @@ -330,23 +313,19 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, if (sd->shader & SHADER_SMOOTH_NORMAL) { sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); -#ifdef __INSTANCING__ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_normal_transform_auto(kg, sd, &sd->N); } -#endif } /* dPdu/dPdv */ #ifdef __DPDU__ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); -# ifdef __INSTANCING__ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_dir_transform_auto(kg, sd, &sd->dPdu); object_dir_transform_auto(kg, sd, &sd->dPdv); } -# endif #endif } else { @@ -432,15 +411,11 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, sd->time = ray->time; sd->ray_length = 0.0f; -#ifdef __INSTANCING__ sd->object = OBJECT_NONE; -#endif sd->lamp = LAMP_NONE; sd->prim = PRIM_NONE; -#ifdef __UV__ sd->u = 0.0f; sd->v = 0.0f; -#endif #ifdef __DPDU__ /* dPdu/dPdv */ @@ -481,17 +456,13 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s sd->time = ray->time; sd->ray_length = 0.0f; /* todo: can we set this to some useful value? 
*/ -# ifdef __INSTANCING__ sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */ -# endif sd->lamp = LAMP_NONE; sd->prim = PRIM_NONE; sd->type = PRIMITIVE_NONE; -# ifdef __UV__ sd->u = 0.0f; sd->v = 0.0f; -# endif # ifdef __DPDU__ /* dPdu/dPdv */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 0a0cf1bd6c0..fc9cc73a704 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -84,9 +84,7 @@ CCL_NAMESPACE_BEGIN /* Kernel features */ #define __SOBOL__ -#define __INSTANCING__ #define __DPDU__ -#define __UV__ #define __BACKGROUND__ #define __CAUSTICS_TRICKS__ #define __VISIBILITY_FLAG__ @@ -125,9 +123,6 @@ CCL_NAMESPACE_BEGIN /* Device specific features */ #ifdef __KERNEL_CPU__ -# ifdef __KERNEL_SSE2__ -# define __QBVH__ -# endif # ifdef WITH_OSL # define __OSL__ # endif @@ -696,27 +691,38 @@ typedef enum PrimitiveType { PRIMITIVE_NONE = 0, PRIMITIVE_TRIANGLE = (1 << 0), PRIMITIVE_MOTION_TRIANGLE = (1 << 1), - PRIMITIVE_CURVE = (1 << 2), - PRIMITIVE_MOTION_CURVE = (1 << 3), + PRIMITIVE_CURVE_THICK = (1 << 2), + PRIMITIVE_MOTION_CURVE_THICK = (1 << 3), + PRIMITIVE_CURVE_RIBBON = (1 << 4), + PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5), /* Lamp primitive is not included below on purpose, * since it is no real traceable primitive. */ - PRIMITIVE_LAMP = (1 << 4), + PRIMITIVE_LAMP = (1 << 6), PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE), - PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE | PRIMITIVE_MOTION_CURVE), - PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE), + PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE_THICK | PRIMITIVE_MOTION_CURVE_THICK | + PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON), + PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK | + PRIMITIVE_MOTION_CURVE_RIBBON), PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE), /* Total number of different traceable primitives. 
* NOTE: This is an actual value, not a bitflag. */ - PRIMITIVE_NUM_TOTAL = 4, + PRIMITIVE_NUM_TOTAL = 6, } PrimitiveType; #define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type)) #define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL) +typedef enum CurveShapeType { + CURVE_RIBBON = 0, + CURVE_THICK = 1, + + CURVE_NUM_SHAPE_TYPES, +} CurveShapeType; + /* Attributes */ typedef enum AttributePrimitive { @@ -1291,6 +1297,24 @@ typedef struct KernelBackground { float ao_factor; float ao_distance; float ao_bounces_factor; + + /* portal sampling */ + float portal_weight; + int num_portals; + int portal_offset; + + /* sun sampling */ + float sun_weight; + /* xyz store direction, w the angle. float4 instead of float3 is used + * to ensure consistent padding/alignment across devices. */ + float4 sun; + + /* map sampling */ + float map_weight; + int map_res_x; + int map_res_y; + + int use_mis; } KernelBackground; static_assert_align(KernelBackground, 16); @@ -1302,15 +1326,8 @@ typedef struct KernelIntegrator { int num_all_lights; float pdf_triangles; float pdf_lights; - int pdf_background_res_x; - int pdf_background_res_y; float light_inv_rr_threshold; - /* light portals */ - float portal_pdf; - int num_portals; - int portal_offset; - /* bounces */ int min_bounce; int max_bounce; @@ -1372,7 +1389,7 @@ typedef struct KernelIntegrator { int max_closures; - int pad1; + int pad1, pad2; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1380,13 +1397,11 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_NONE = 0, BVH_LAYOUT_BVH2 = (1 << 0), - BVH_LAYOUT_BVH4 = (1 << 1), - BVH_LAYOUT_BVH8 = (1 << 2), + BVH_LAYOUT_EMBREE = (1 << 1), + BVH_LAYOUT_OPTIX = (1 << 2), - BVH_LAYOUT_EMBREE = (1 << 3), - BVH_LAYOUT_OPTIX = (1 << 4), - - BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8, + /* Default BVH layout to use for CPU. 
*/ + BVH_LAYOUT_AUTO = BVH_LAYOUT_EMBREE, BVH_LAYOUT_ALL = (unsigned int)(~0u), } KernelBVHLayout; @@ -1395,9 +1410,9 @@ typedef struct KernelBVH { int root; int have_motion; int have_curves; - int have_instancing; int bvh_layout; int use_bvh_steps; + int curve_subdivisions; /* Custom BVH */ #ifdef __KERNEL_OPTIX__ @@ -1415,25 +1430,6 @@ typedef struct KernelBVH { } KernelBVH; static_assert_align(KernelBVH, 16); -typedef enum CurveFlag { - /* runtime flags */ - CURVE_KN_BACKFACING = 1, /* backside of cylinder? */ - CURVE_KN_ENCLOSEFILTER = 2, /* don't consider strands surrounding start point? */ - CURVE_KN_INTERPOLATE = 4, /* render as a curve? */ - CURVE_KN_ACCURATE = 8, /* use accurate intersections test? */ - CURVE_KN_INTERSECTCORRECTION = 16, /* correct for width after determing closest midpoint? */ - CURVE_KN_TRUETANGENTGNORMAL = 32, /* use tangent normal for geometry? */ - CURVE_KN_RIBBONS = 64, /* use flat curve ribbons */ -} CurveFlag; - -typedef struct KernelCurves { - int curveflags; - int subdivisions; - - int pad1, pad2; -} KernelCurves; -static_assert_align(KernelCurves, 16); - typedef struct KernelTables { int beckmann_offset; int pad1, pad2, pad3; @@ -1454,7 +1450,6 @@ typedef struct KernelData { KernelBackground background; KernelIntegrator integrator; KernelBVH bvh; - KernelCurves curve; KernelTables tables; KernelBake bake; } KernelData; diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 8829a14ead5..8040bfb7b33 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -64,12 +64,14 @@ CCL_NAMESPACE_BEGIN /* Memory Copy */ -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size) +void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t) { - if (strcmp(name, "__data") == 0) - memcpy(&kg->__data, host, size); - else + if (strcmp(name, "__data") == 0) { + kg->__data = *(KernelData *)host; + } 
+ else { assert(0); + } } void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size) diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/kernels/optix/kernel_optix.cu index e03504316ad..c730d952ed4 100644 --- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu +++ b/intern/cycles/kernel/kernels/optix/kernel_optix.cu @@ -256,11 +256,9 @@ extern "C" __global__ void __closesthit__kernel_optix_hit() } #ifdef __HAIR__ -extern "C" __global__ void __intersection__curve() +ccl_device_inline void optix_intersection_curve(const uint prim, const uint type) { - const uint prim = optixGetPrimitiveIndex(); const uint object = get_object_id<true>(); - const uint type = kernel_tex_fetch(__prim_type, prim); const uint visibility = optixGetPayload_4(); float3 P = optixGetObjectRayOrigin(); @@ -282,14 +280,30 @@ extern "C" __global__ void __intersection__curve() if (isect.t != FLT_MAX) isect.t *= len; - if (!(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) ? 
- curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type) : - cardinal_curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) { optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, __float_as_int(isect.u), // Attribute_0 __float_as_int(isect.v)); // Attribute_1 } + +} + +extern "C" __global__ void __intersection__curve_ribbon() +{ + const uint prim = optixGetPrimitiveIndex(); + const uint type = kernel_tex_fetch(__prim_type, prim); + + if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { + optix_intersection_curve(prim, type); + } +} + +extern "C" __global__ void __intersection__curve_all() +{ + const uint prim = optixGetPrimitiveIndex(); + const uint type = kernel_tex_fetch(__prim_type, prim); + optix_intersection_curve(prim, type); } #endif diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt index fc0c845fd4f..d7ab778181e 100644 --- a/intern/cycles/kernel/osl/CMakeLists.txt +++ b/intern/cycles/kernel/osl/CMakeLists.txt @@ -36,6 +36,15 @@ set(LIB # OSL and LLVM are built without RTTI set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}") +if(APPLE) + # Disable allocation warning on macOS prior to 10.14: the OSLRenderServices + # contains member which is 64 bytes aligned (cache inside of OIIO's + # unordered_map_concurrent). This is not something what the SDK supportsm, but + # since we take care of allocations ourselves is is OK to ignore the + # diagnostic message. 
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-allocation") +endif() + include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 872a55143cc..7ee467a46dd 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -362,6 +362,9 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) id++, closure_bsdf_transparent_params(), closure_bsdf_transparent_prepare); + + register_closure( + ss, "microfacet", id++, closure_bsdf_microfacet_params(), closure_bsdf_microfacet_prepare); register_closure(ss, "microfacet_ggx", id++, @@ -508,6 +511,82 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) return false; } +/* Standard Microfacet Closure */ + +class MicrofacetClosure : public CBSDFClosure { + public: + MicrofacetBsdf params; + ustring distribution; + int refract; + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + static ustring u_ggx("ggx"); + static ustring u_default("default"); + + const int label = (refract) ? 
LABEL_TRANSMIT : LABEL_REFLECT; + if (skip(sd, path_flag, LABEL_GLOSSY | label)) { + return; + } + + MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl( + sd, sizeof(MicrofacetBsdf), weight, ¶ms); + + if (!bsdf) { + return; + } + + /* GGX */ + if (distribution == u_ggx || distribution == u_default) { + if (!refract) { + if (params.alpha_x == params.alpha_y) { + /* Isotropic */ + sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf); + } + else { + /* Anisotropic */ + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); + } + } + else { + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); + } + } + /* Beckmann */ + else { + if (!refract) { + if (params.alpha_x == params.alpha_y) { + /* Isotropic */ + sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf); + } + else { + /* Anisotropic */ + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); + } + } + else { + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + } + } + } +}; + +ClosureParam *closure_bsdf_microfacet_params() +{ + static ClosureParam params[] = {CLOSURE_STRING_PARAM(MicrofacetClosure, distribution), + CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.ior), + CLOSURE_INT_PARAM(MicrofacetClosure, refract), + CLOSURE_STRING_KEYPARAM(MicrofacetClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetClosure)}; + + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_prepare, MicrofacetClosure) + /* GGX closures with Fresnel */ class MicrofacetFresnelClosure : public CBSDFClosure { diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index d12afdb80dd..e4058e3a746 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -51,6 +51,7 @@ OSL::ClosureParam 
*closure_bsdf_transparent_params(); OSL::ClosureParam *closure_bssrdf_params(); OSL::ClosureParam *closure_absorption_params(); OSL::ClosureParam *closure_henyey_greenstein_params(); +OSL::ClosureParam *closure_bsdf_microfacet_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params(); @@ -70,6 +71,7 @@ void closure_bsdf_transparent_prepare(OSL::RendererServices *, int id, void *dat void closure_bssrdf_prepare(OSL::RendererServices *, int id, void *data); void closure_absorption_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data); diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl index 4def237a2e0..08bc8f85120 100644 --- a/intern/cycles/kernel/shaders/node_sky_texture.osl +++ b/intern/cycles/kernel/shaders/node_sky_texture.osl @@ -44,13 +44,13 @@ float sky_perez_function(float lam[9], float theta, float gamma) (1.0 + lam[2] * exp(lam[3] * gamma) + lam[4] * cgamma * cgamma); } -color sky_radiance_old(normal dir, - float sunphi, - float suntheta, - color radiance, - float config_x[9], - float config_y[9], - float config_z[9]) +color sky_radiance_preetham(normal dir, + float sunphi, + float suntheta, + color radiance, + float config_x[9], + float config_y[9], + float config_z[9]) { /* convert vector to spherical coordinates */ vector spherical = sky_spherical_coordinates(dir); @@ -88,13 +88,13 @@ float 
sky_radiance_internal(float config[9], float theta, float gamma) (config[2] + config[3] * expM + config[5] * rayM + config[6] * mieM + config[7] * zenith); } -color sky_radiance_new(normal dir, - float sunphi, - float suntheta, - color radiance, - float config_x[9], - float config_y[9], - float config_z[9]) +color sky_radiance_hosek(normal dir, + float sunphi, + float suntheta, + color radiance, + float config_x[9], + float config_y[9], + float config_z[9]) { /* convert vector to spherical coordinates */ vector spherical = sky_spherical_coordinates(dir); @@ -116,16 +116,103 @@ color sky_radiance_new(normal dir, return xyz_to_rgb(x, y, z) * (M_2PI / 683); } +/* Nishita improved */ +vector geographical_to_direction(float lat, float lon) +{ + return vector(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)); +} + +color sky_radiance_nishita(vector dir, float nishita_data[9], string filename) +{ + /* definitions */ + float sun_elevation = nishita_data[6]; + float sun_rotation = nishita_data[7]; + float angular_diameter = nishita_data[8]; + int sun_disc = angular_diameter > 0; + float alpha = 1.0; + color xyz; + /* convert dir to spherical coordinates */ + vector direction = sky_spherical_coordinates(dir); + + /* render above the horizon */ + if (dir[2] >= 0.0) { + /* definitions */ + vector sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2); + float sun_dir_angle = acos(dot(dir, sun_dir)); + float half_angular = angular_diameter / 2.0; + float dir_elevation = M_PI_2 - direction[0]; + + /* if ray inside sun disc render it, otherwise render sky */ + if (sun_dir_angle < half_angular && sun_disc == 1) { + /* get 3 pixels data */ + color pixel_bottom = color(nishita_data[0], nishita_data[1], nishita_data[2]); + color pixel_top = color(nishita_data[3], nishita_data[4], nishita_data[5]); + float y; + + /* sun interpolation */ + if (sun_elevation - half_angular > 0.0) { + if ((sun_elevation + half_angular) > 0.0) { + y = ((dir_elevation - sun_elevation) 
/ angular_diameter) + 0.5; + xyz = mix(pixel_bottom, pixel_top, y); + } + } + else { + if (sun_elevation + half_angular > 0.0) { + y = dir_elevation / (sun_elevation + half_angular); + xyz = mix(pixel_bottom, pixel_top, y); + } + } + /* limb darkening, coefficient is 0.6f */ + float angle_fraction = sun_dir_angle / half_angular; + float limb_darkening = (1.0 - 0.6 * (1.0 - sqrt(1.0 - angle_fraction * angle_fraction))); + xyz *= limb_darkening; + } + /* sky */ + else { + /* sky interpolation */ + float x = (direction[1] + M_PI + sun_rotation) / M_2PI; + float y = 1.0 - (dir_elevation / M_PI_2); + if (x > 1.0) { + x = x - 1.0; + } + xyz = (color)texture(filename, x, y, "wrap", "clamp", "interp", "linear", "alpha", alpha); + } + } + /* ground */ + else { + if (dir[2] < -0.4) { + xyz = color(0, 0, 0); + } + else { + /* black ground fade */ + float mul = pow(1.0 + dir[2] * 2.5, 3.0); + /* interpolation */ + float x = (direction[1] + M_PI + sun_rotation) / M_2PI; + float y = 1.5; + if (x > 1.0) { + x = x - 1.0; + } + xyz = (color)texture( + filename, x, y, "wrap", "periodic", "interp", "linear", "alpha", alpha) * + mul; + } + } + /* convert to RGB and adjust strength */ + return xyz_to_rgb(xyz[0], xyz[1], xyz[2]) * 120000.0; +} + shader node_sky_texture(int use_mapping = 0, matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), vector Vector = P, string type = "hosek_wilkie", float theta = 0.0, float phi = 0.0, + string filename = "", color radiance = color(0.0, 0.0, 0.0), float config_x[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, float config_y[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, float config_z[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + float nishita_data[9] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, output color Color = color(0.0, 0.0, 0.0)) { vector p = Vector; @@ -133,8 +220,10 @@ shader node_sky_texture(int use_mapping = 0, if (use_mapping) p = transform(mapping, p); + if (type == "nishita_improved") + Color 
= sky_radiance_nishita(p, nishita_data, filename); if (type == "hosek_wilkie") - Color = sky_radiance_new(p, phi, theta, radiance, config_x, config_y, config_z); - else - Color = sky_radiance_old(p, phi, theta, radiance, config_x, config_y, config_z); + Color = sky_radiance_hosek(p, phi, theta, radiance, config_x, config_y, config_z); + if (type == "preetham") + Color = sky_radiance_preetham(p, phi, theta, radiance, config_x, config_y, config_z); } diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 2c57a142692..1ae94f1d766 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -847,39 +847,29 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { float3 weight = sd->svm_closure_weight * mix_weight; - if (sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { - /* todo: giving a fixed weight here will cause issues when - * mixing multiple BSDFS. energy will not be conserved and - * the throughput can blow up after multiple bounces. 
we - * better figure out a way to skip backfaces from rays - * spawned by transmission from the front */ - bsdf_transparent_setup(sd, make_float3(1.0f, 1.0f, 1.0f), path_flag); - } - else { - HairBsdf *bsdf = (HairBsdf *)bsdf_alloc(sd, sizeof(HairBsdf), weight); + HairBsdf *bsdf = (HairBsdf *)bsdf_alloc(sd, sizeof(HairBsdf), weight); - if (bsdf) { - bsdf->N = N; - bsdf->roughness1 = param1; - bsdf->roughness2 = param2; - bsdf->offset = -stack_load_float(stack, data_node.z); + if (bsdf) { + bsdf->N = N; + bsdf->roughness1 = param1; + bsdf->roughness2 = param2; + bsdf->offset = -stack_load_float(stack, data_node.z); - if (stack_valid(data_node.y)) { - bsdf->T = normalize(stack_load_float3(stack, data_node.y)); - } - else if (!(sd->type & PRIMITIVE_ALL_CURVE)) { - bsdf->T = normalize(sd->dPdv); - bsdf->offset = 0.0f; - } - else - bsdf->T = normalize(sd->dPdu); + if (stack_valid(data_node.y)) { + bsdf->T = normalize(stack_load_float3(stack, data_node.y)); + } + else if (!(sd->type & PRIMITIVE_ALL_CURVE)) { + bsdf->T = normalize(sd->dPdv); + bsdf->offset = 0.0f; + } + else + bsdf->T = normalize(sd->dPdu); - if (type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - sd->flag |= bsdf_hair_reflection_setup(bsdf); - } - else { - sd->flag |= bsdf_hair_transmission_setup(bsdf); - } + if (type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { + sd->flag |= bsdf_hair_reflection_setup(bsdf); + } + else { + sd->flag |= bsdf_hair_transmission_setup(bsdf); } } diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 019c6294082..77df19b2298 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -41,11 +41,9 @@ ccl_device_inline void svm_node_geometry( case NODE_GEOM_Ng: data = sd->Ng; break; -#ifdef __UV__ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; -#endif default: data = make_float3(0.0f, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h 
index 914ef2089a9..7db8ffcc6e1 100644 --- a/intern/cycles/kernel/svm/svm_noise.h +++ b/intern/cycles/kernel/svm/svm_noise.h @@ -573,8 +573,8 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f) * * Point Offset from v0 * v0 (0, 0, 0) - * v1 (0, 0, 1) The full avx type is computed by inserting the following - * v2 (0, 1, 0) sse types into both the low and high parts of the avx. + * v1 (0, 0, 1) The full AVX type is computed by inserting the following + * v2 (0, 1, 0) SSE types into both the low and high parts of the AVX. * v3 (0, 1, 1) * v4 (1, 0, 0) * v5 (1, 0, 1) (0, 1, 0, 1) = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(V, V + 1)) diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h index 50fe0c8232f..e877bd9a5c8 100644 --- a/intern/cycles/kernel/svm/svm_sky.h +++ b/intern/cycles/kernel/svm/svm_sky.h @@ -37,16 +37,16 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma) (1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma); } -ccl_device float3 sky_radiance_old(KernelGlobals *kg, - float3 dir, - float sunphi, - float suntheta, - float radiance_x, - float radiance_y, - float radiance_z, - float *config_x, - float *config_y, - float *config_z) +ccl_device float3 sky_radiance_preetham(KernelGlobals *kg, + float3 dir, + float sunphi, + float suntheta, + float radiance_x, + float radiance_y, + float radiance_z, + float *config_x, + float *config_y, + float *config_z) { /* convert vector to spherical coordinates */ float2 spherical = direction_to_spherical(dir); @@ -90,16 +90,16 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float configuration[6] * mieM + configuration[7] * zenith); } -ccl_device float3 sky_radiance_new(KernelGlobals *kg, - float3 dir, - float sunphi, - float suntheta, - float radiance_x, - float radiance_y, - float radiance_z, - float *config_x, - float *config_y, - float *config_z) +ccl_device float3 sky_radiance_hosek(KernelGlobals *kg, + float3 dir, 
+ float sunphi, + float suntheta, + float radiance_x, + float radiance_y, + float radiance_z, + float *config_x, + float *config_y, + float *config_z) { /* convert vector to spherical coordinates */ float2 spherical = direction_to_spherical(dir); @@ -121,93 +121,206 @@ ccl_device float3 sky_radiance_new(KernelGlobals *kg, return xyz_to_rgb(kg, make_float3(x, y, z)) * (M_2PI_F / 683); } +/* Nishita improved sky model */ +ccl_device float3 geographical_to_direction(float lat, float lon) +{ + return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)); +} + +ccl_device float3 sky_radiance_nishita(KernelGlobals *kg, + float3 dir, + float *nishita_data, + uint texture_id) +{ + /* definitions */ + float sun_elevation = nishita_data[6]; + float sun_rotation = nishita_data[7]; + float angular_diameter = nishita_data[8]; + bool sun_disc = (angular_diameter > 0.0f); + float3 xyz; + /* convert dir to spherical coordinates */ + float2 direction = direction_to_spherical(dir); + + /* render above the horizon */ + if (dir.z >= 0.0f) { + /* definitions */ + float3 sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2_F); + float sun_dir_angle = acos(dot(dir, sun_dir)); + float half_angular = angular_diameter / 2.0f; + float dir_elevation = M_PI_2_F - direction.x; + + /* if ray inside sun disc render it, otherwise render sky */ + if (sun_disc && sun_dir_angle < half_angular) { + /* get 3 pixels data */ + float3 pixel_bottom = make_float3(nishita_data[0], nishita_data[1], nishita_data[2]); + float3 pixel_top = make_float3(nishita_data[3], nishita_data[4], nishita_data[5]); + float y; + + /* sun interpolation */ + if (sun_elevation - half_angular > 0.0f) { + if (sun_elevation + half_angular > 0.0f) { + y = ((dir_elevation - sun_elevation) / angular_diameter) + 0.5f; + xyz = interp(pixel_bottom, pixel_top, y); + } + } + else { + if (sun_elevation + half_angular > 0.0f) { + y = dir_elevation / (sun_elevation + half_angular); + xyz = interp(pixel_bottom, 
pixel_top, y); + } + } + /* limb darkening, coefficient is 0.6f */ + float limb_darkening = (1.0f - + 0.6f * (1.0f - sqrtf(1.0f - sqr(sun_dir_angle / half_angular)))); + xyz *= limb_darkening; + } + /* sky */ + else { + /* sky interpolation */ + float x = (direction.y + M_PI_F + sun_rotation) / M_2PI_F; + float y = dir_elevation / M_PI_2_F; + if (x > 1.0f) { + x -= 1.0f; + } + xyz = float4_to_float3(kernel_tex_image_interp(kg, texture_id, x, y)); + } + } + /* ground */ + else { + if (dir.z < -0.4f) { + xyz = make_float3(0.0f, 0.0f, 0.0f); + } + else { + /* black ground fade */ + float fade = 1.0f + dir.z * 2.5f; + fade = sqr(fade) * fade; + /* interpolation */ + float x = (direction.y + M_PI_F + sun_rotation) / M_2PI_F; + if (x > 1.0f) { + x -= 1.0f; + } + xyz = float4_to_float3(kernel_tex_image_interp(kg, texture_id, x, -0.5)) * fade; + } + } + + /* convert to rgb and adjust strength */ + return xyz_to_rgb(kg, xyz) * 120000.0f; +} + ccl_device void svm_node_tex_sky( KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) { - /* Define variables */ - float sunphi, suntheta, radiance_x, radiance_y, radiance_z; - float config_x[9], config_y[9], config_z[9]; - /* Load data */ uint dir_offset = node.y; uint out_offset = node.z; int sky_model = node.w; - float4 data = read_node_float(kg, offset); - sunphi = data.x; - suntheta = data.y; - radiance_x = data.z; - radiance_y = data.w; - - data = read_node_float(kg, offset); - radiance_z = data.x; - config_x[0] = data.y; - config_x[1] = data.z; - config_x[2] = data.w; - - data = read_node_float(kg, offset); - config_x[3] = data.x; - config_x[4] = data.y; - config_x[5] = data.z; - config_x[6] = data.w; - - data = read_node_float(kg, offset); - config_x[7] = data.x; - config_x[8] = data.y; - config_y[0] = data.z; - config_y[1] = data.w; - - data = read_node_float(kg, offset); - config_y[2] = data.x; - config_y[3] = data.y; - config_y[4] = data.z; - config_y[5] = data.w; - - data = read_node_float(kg, offset); 
- config_y[6] = data.x; - config_y[7] = data.y; - config_y[8] = data.z; - config_z[0] = data.w; - - data = read_node_float(kg, offset); - config_z[1] = data.x; - config_z[2] = data.y; - config_z[3] = data.z; - config_z[4] = data.w; - - data = read_node_float(kg, offset); - config_z[5] = data.x; - config_z[6] = data.y; - config_z[7] = data.z; - config_z[8] = data.w; - float3 dir = stack_load_float3(stack, dir_offset); float3 f; - /* Compute Sky */ - if (sky_model == 0) { - f = sky_radiance_old(kg, - dir, - sunphi, - suntheta, - radiance_x, - radiance_y, - radiance_z, - config_x, - config_y, - config_z); + /* Preetham and Hosek share the same data */ + if (sky_model == 0 || sky_model == 1) { + /* Define variables */ + float sunphi, suntheta, radiance_x, radiance_y, radiance_z; + float config_x[9], config_y[9], config_z[9]; + + float4 data = read_node_float(kg, offset); + sunphi = data.x; + suntheta = data.y; + radiance_x = data.z; + radiance_y = data.w; + + data = read_node_float(kg, offset); + radiance_z = data.x; + config_x[0] = data.y; + config_x[1] = data.z; + config_x[2] = data.w; + + data = read_node_float(kg, offset); + config_x[3] = data.x; + config_x[4] = data.y; + config_x[5] = data.z; + config_x[6] = data.w; + + data = read_node_float(kg, offset); + config_x[7] = data.x; + config_x[8] = data.y; + config_y[0] = data.z; + config_y[1] = data.w; + + data = read_node_float(kg, offset); + config_y[2] = data.x; + config_y[3] = data.y; + config_y[4] = data.z; + config_y[5] = data.w; + + data = read_node_float(kg, offset); + config_y[6] = data.x; + config_y[7] = data.y; + config_y[8] = data.z; + config_z[0] = data.w; + + data = read_node_float(kg, offset); + config_z[1] = data.x; + config_z[2] = data.y; + config_z[3] = data.z; + config_z[4] = data.w; + + data = read_node_float(kg, offset); + config_z[5] = data.x; + config_z[6] = data.y; + config_z[7] = data.z; + config_z[8] = data.w; + + /* Compute Sky */ + if (sky_model == 0) { + f = sky_radiance_preetham(kg, + 
dir, + sunphi, + suntheta, + radiance_x, + radiance_y, + radiance_z, + config_x, + config_y, + config_z); + } + else { + f = sky_radiance_hosek(kg, + dir, + sunphi, + suntheta, + radiance_x, + radiance_y, + radiance_z, + config_x, + config_y, + config_z); + } } + /* Nishita */ else { - f = sky_radiance_new(kg, - dir, - sunphi, - suntheta, - radiance_x, - radiance_y, - radiance_z, - config_x, - config_y, - config_z); + /* Define variables */ + float nishita_data[9]; + + float4 data = read_node_float(kg, offset); + nishita_data[0] = data.x; + nishita_data[1] = data.y; + nishita_data[2] = data.z; + nishita_data[3] = data.w; + + data = read_node_float(kg, offset); + nishita_data[4] = data.x; + nishita_data[5] = data.y; + nishita_data[6] = data.z; + nishita_data[7] = data.w; + + data = read_node_float(kg, offset); + nishita_data[8] = data.x; + uint texture_id = __float_as_uint(data.y); + + /* Compute Sky */ + f = sky_radiance_nishita(kg, dir, nishita_data, texture_id); } stack_store_float3(stack, out_offset, f); diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index e913d9e0489..f1ebb37e23e 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -414,7 +414,7 @@ typedef enum NodeWaveProfile { NODE_WAVE_PROFILE_TRI, } NodeWaveProfile; -typedef enum NodeSkyType { NODE_SKY_OLD, NODE_SKY_NEW } NodeSkyType; +typedef enum NodeSkyType { NODE_SKY_PREETHAM, NODE_SKY_HOSEK, NODE_SKY_NISHITA } NodeSkyType; typedef enum NodeGradientType { NODE_BLEND_LINEAR, diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index 472b5a0c101..e37a0407976 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -24,6 +24,7 @@ set(SRC hair.cpp image.cpp image_oiio.cpp + image_sky.cpp image_vdb.cpp integrator.cpp jitter.cpp @@ -64,6 +65,7 @@ set(SRC_HEADERS hair.h image.h image_oiio.h + image_sky.h image_vdb.h integrator.h light.h diff --git 
a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp index 1907bb33d06..db48d8b6430 100644 --- a/intern/cycles/render/curves.cpp +++ b/intern/cycles/render/curves.cpp @@ -36,13 +36,12 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim) float *p2 = &p[2].x; float *p3 = &p[3].x; - float fc = 0.71f; + /* Catmull-Rom weights. */ float curve_coef[4]; curve_coef[0] = p1[dim]; - curve_coef[1] = -fc * p0[dim] + fc * p2[dim]; - curve_coef[2] = 2.0f * fc * p0[dim] + (fc - 3.0f) * p1[dim] + (3.0f - 2.0f * fc) * p2[dim] - - fc * p3[dim]; - curve_coef[3] = -fc * p0[dim] + (2.0f - fc) * p1[dim] + (fc - 2.0f) * p2[dim] + fc * p3[dim]; + curve_coef[1] = 0.5f * (-p0[dim] + p2[dim]); + curve_coef[2] = 0.5f * (2 * p0[dim] - 5 * p1[dim] + 4 * p2[dim] - p3[dim]); + curve_coef[3] = 0.5f * (-p0[dim] + 3 * p1[dim] - 3 * p2[dim] + p3[dim]); float discroot = curve_coef[2] * curve_coef[2] - 3 * curve_coef[3] * curve_coef[1]; float ta = -1.0f; @@ -77,105 +76,4 @@ void curvebounds(float *lower, float *upper, float3 *p, int dim) *lower = min(*lower, min(exa, exb)); } -/* Hair System Manager */ - -CurveSystemManager::CurveSystemManager() -{ - primitive = CURVE_LINE_SEGMENTS; - curve_shape = CURVE_THICK; - line_method = CURVE_CORRECTED; - triangle_method = CURVE_CAMERA_TRIANGLES; - resolution = 3; - subdivisions = 3; - - use_curves = true; - use_encasing = true; - use_backfacing = false; - use_tangent_normal_geometry = false; - - need_update = true; - need_mesh_update = false; -} - -CurveSystemManager::~CurveSystemManager() -{ -} - -void CurveSystemManager::device_update(Device *device, - DeviceScene *dscene, - Scene * /*scene*/, - Progress &progress) -{ - if (!need_update) - return; - - device_free(device, dscene); - - progress.set_status("Updating Hair settings", "Copying Hair settings to device"); - - KernelCurves *kcurve = &dscene->data.curve; - - kcurve->curveflags = 0; - - if (use_curves) { - if (primitive == CURVE_SEGMENTS || primitive == CURVE_RIBBONS) - 
kcurve->curveflags |= CURVE_KN_INTERPOLATE; - if (primitive == CURVE_RIBBONS) - kcurve->curveflags |= CURVE_KN_RIBBONS; - - if (line_method == CURVE_ACCURATE) - kcurve->curveflags |= CURVE_KN_ACCURATE; - else if (line_method == CURVE_CORRECTED) - kcurve->curveflags |= CURVE_KN_INTERSECTCORRECTION; - - if (use_tangent_normal_geometry) - kcurve->curveflags |= CURVE_KN_TRUETANGENTGNORMAL; - if (use_backfacing) - kcurve->curveflags |= CURVE_KN_BACKFACING; - if (use_encasing) - kcurve->curveflags |= CURVE_KN_ENCLOSEFILTER; - - kcurve->subdivisions = subdivisions; - } - - if (progress.get_cancel()) - return; - - need_update = false; -} - -void CurveSystemManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/) -{ -} - -bool CurveSystemManager::modified(const CurveSystemManager &CurveSystemManager) -{ - return !( - curve_shape == CurveSystemManager.curve_shape && - line_method == CurveSystemManager.line_method && primitive == CurveSystemManager.primitive && - use_encasing == CurveSystemManager.use_encasing && - use_tangent_normal_geometry == CurveSystemManager.use_tangent_normal_geometry && - use_backfacing == CurveSystemManager.use_backfacing && - triangle_method == CurveSystemManager.triangle_method && - resolution == CurveSystemManager.resolution && use_curves == CurveSystemManager.use_curves && - subdivisions == CurveSystemManager.subdivisions); -} - -bool CurveSystemManager::modified_mesh(const CurveSystemManager &CurveSystemManager) -{ - return !( - primitive == CurveSystemManager.primitive && curve_shape == CurveSystemManager.curve_shape && - triangle_method == CurveSystemManager.triangle_method && - resolution == CurveSystemManager.resolution && use_curves == CurveSystemManager.use_curves); -} - -void CurveSystemManager::tag_update(Scene * /*scene*/) -{ - need_update = true; -} - -void CurveSystemManager::tag_update_mesh() -{ - need_mesh_update = true; -} CCL_NAMESPACE_END diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h index 
ade289a402e..c52fcb9c882 100644 --- a/intern/cycles/render/curves.h +++ b/intern/cycles/render/curves.h @@ -20,6 +20,8 @@ #include "util/util_array.h" #include "util/util_types.h" +#include "render/hair.h" + CCL_NAMESPACE_BEGIN class Device; @@ -29,33 +31,6 @@ class Scene; void curvebounds(float *lower, float *upper, float3 *p, int dim); -typedef enum CurvePrimitiveType { - CURVE_TRIANGLES = 0, - CURVE_LINE_SEGMENTS = 1, - CURVE_SEGMENTS = 2, - CURVE_RIBBONS = 3, - - CURVE_NUM_PRIMITIVE_TYPES, -} CurvePrimitiveType; - -typedef enum CurveShapeType { - CURVE_RIBBON = 0, - CURVE_THICK = 1, - - CURVE_NUM_SHAPE_TYPES, -} CurveShapeType; - -typedef enum CurveTriangleMethod { - CURVE_CAMERA_TRIANGLES, - CURVE_TESSELATED_TRIANGLES -} CurveTriangleMethod; - -typedef enum CurveLineMethod { - CURVE_ACCURATE, - CURVE_CORRECTED, - CURVE_UNCORRECTED -} CurveLineMethod; - class ParticleCurveData { public: @@ -75,43 +50,12 @@ class ParticleCurveData { array<int> curve_keynum; array<float> curve_length; array<float2> curve_uv; - array<float3> curve_vcol; + array<float4> curve_vcol; array<float3> curvekey_co; array<float> curvekey_time; }; -/* HairSystem Manager */ - -class CurveSystemManager { - public: - CurvePrimitiveType primitive; - CurveShapeType curve_shape; - CurveLineMethod line_method; - CurveTriangleMethod triangle_method; - int resolution; - int subdivisions; - - bool use_curves; - bool use_encasing; - bool use_backfacing; - bool use_tangent_normal_geometry; - - bool need_update; - bool need_mesh_update; - - CurveSystemManager(); - ~CurveSystemManager(); - - void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress); - void device_free(Device *device, DeviceScene *dscene); - bool modified(const CurveSystemManager &CurveSystemManager); - bool modified_mesh(const CurveSystemManager &CurveSystemManager); - - void tag_update(Scene *scene); - void tag_update_mesh(); -}; - CCL_NAMESPACE_END #endif /* __CURVES_H__ */ diff --git 
a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp index 4d819d1119e..4055bc4773b 100644 --- a/intern/cycles/render/denoising.cpp +++ b/intern/cycles/render/denoising.cpp @@ -21,6 +21,7 @@ #include "util/util_foreach.h" #include "util/util_map.h" #include "util/util_system.h" +#include "util/util_task.h" #include "util/util_time.h" #include <OpenImageIO/filesystem.h> @@ -377,8 +378,9 @@ void DenoiseTask::create_task(DeviceTask &task) /* Denoising parameters. */ task.denoising = denoiser->params; - task.denoising_do_filter = true; - task.denoising_write_passes = false; + task.denoising.type = DENOISER_NLM; + task.denoising.use = true; + task.denoising.store_passes = false; task.denoising_from_render = false; task.denoising_frames.resize(neighbor_frames.size()); diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp index d46ed430c4f..3d1b6e1d865 100644 --- a/intern/cycles/render/geometry.cpp +++ b/intern/cycles/render/geometry.cpp @@ -16,10 +16,9 @@ #include "bvh/bvh.h" #include "bvh/bvh_build.h" +#include "bvh/bvh_embree.h" -#ifdef WITH_EMBREE -# include "bvh/bvh_embree.h" -#endif +#include "device/device.h" #include "render/attribute.h" #include "render/camera.h" @@ -212,8 +211,7 @@ void Geometry::compute_bvh( bparams.num_motion_triangle_steps = params->num_bvh_time_steps; bparams.num_motion_curve_steps = params->num_bvh_time_steps; bparams.bvh_type = params->bvh_type; - bparams.curve_flags = dscene->data.curve.curveflags; - bparams.curve_subdivisions = dscene->data.curve.subdivisions; + bparams.curve_subdivisions = params->curve_subdivisions(); delete bvh; bvh = BVH::create(bparams, geometry, objects); @@ -1027,28 +1025,18 @@ void GeometryManager::device_update_bvh(Device *device, bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps; bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps; bparams.bvh_type = scene->params.bvh_type; - bparams.curve_flags = dscene->data.curve.curveflags; - 
bparams.curve_subdivisions = dscene->data.curve.subdivisions; + bparams.curve_subdivisions = scene->params.curve_subdivisions(); VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout."; -#ifdef WITH_EMBREE - if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) { - if (dscene->data.bvh.scene) { - BVHEmbree::destroy(dscene->data.bvh.scene); - } - } -#endif - BVH *bvh = BVH::create(bparams, scene->geometry, scene->objects); bvh->build(progress, &device->stats); if (progress.get_cancel()) { #ifdef WITH_EMBREE - if (bparams.bvh_layout == BVH_LAYOUT_EMBREE) { - if (dscene->data.bvh.scene) { - BVHEmbree::destroy(dscene->data.bvh.scene); - } + if (dscene->data.bvh.scene) { + BVHEmbree::destroy(dscene->data.bvh.scene); + dscene->data.bvh.scene = NULL; } #endif delete bvh; @@ -1104,6 +1092,7 @@ void GeometryManager::device_update_bvh(Device *device, dscene->data.bvh.root = pack.root_index; dscene->data.bvh.bvh_layout = bparams.bvh_layout; dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0); + dscene->data.bvh.curve_subdivisions = scene->params.curve_subdivisions(); bvh->copy_to_device(progress, dscene); @@ -1146,6 +1135,12 @@ void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Pro create_volume_mesh(mesh, progress); } } + + if (geom->type == Geometry::HAIR) { + /* Set curve shape, still a global scene setting for now. 
*/ + Hair *hair = static_cast<Hair *>(geom); + hair->curve_shape = scene->params.hair_shape; + } } need_flags_update = false; @@ -1413,6 +1408,14 @@ void GeometryManager::device_update(Device *device, void GeometryManager::device_free(Device *device, DeviceScene *dscene) { +#ifdef WITH_EMBREE + if (dscene->data.bvh.scene) { + if (dscene->data.bvh.bvh_layout == BVH_LAYOUT_EMBREE) + BVHEmbree::destroy(dscene->data.bvh.scene); + dscene->data.bvh.scene = NULL; + } +#endif + dscene->bvh_nodes.free(); dscene->bvh_leaf_nodes.free(); dscene->object_node.free(); diff --git a/intern/cycles/render/hair.cpp b/intern/cycles/render/hair.cpp index 3daa4cc1e35..816c15cf4ef 100644 --- a/intern/cycles/render/hair.cpp +++ b/intern/cycles/render/hair.cpp @@ -294,6 +294,7 @@ NODE_DEFINE(Hair) Hair::Hair() : Geometry(node_type, Geometry::HAIR) { curvekey_offset = 0; + curve_shape = CURVE_RIBBON; } Hair::~Hair() diff --git a/intern/cycles/render/hair.h b/intern/cycles/render/hair.h index 79f77a78753..39d6a34d799 100644 --- a/intern/cycles/render/hair.h +++ b/intern/cycles/render/hair.h @@ -96,6 +96,7 @@ class Hair : public Geometry { /* BVH */ size_t curvekey_offset; + CurveShapeType curve_shape; /* Constructor/Destructor */ Hair(); diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 75050b66bf2..8d187814d64 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -27,6 +27,7 @@ #include "util/util_logging.h" #include "util/util_path.h" #include "util/util_progress.h" +#include "util/util_task.h" #include "util/util_texture.h" #include "util/util_unique_ptr.h" diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 2000582ce70..fffe7c5152a 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -17,7 +17,6 @@ #ifndef __IMAGE_H__ #define __IMAGE_H__ -#include "device/device.h" #include "device/device_memory.h" #include "render/colorspace.h" @@ -31,6 +30,7 @@ CCL_NAMESPACE_BEGIN class 
Device; +class DeviceInfo; class ImageHandle; class ImageKey; class ImageMetaData; diff --git a/intern/cycles/render/image_sky.cpp b/intern/cycles/render/image_sky.cpp new file mode 100644 index 00000000000..442e1d7941f --- /dev/null +++ b/intern/cycles/render/image_sky.cpp @@ -0,0 +1,91 @@ +/* + * Copyright 2011-2020 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "render/image_sky.h" + +#include "util/util_image.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_sky_model.h" +#include "util/util_task.h" + +CCL_NAMESPACE_BEGIN + +SkyLoader::SkyLoader( + float sun_elevation, int altitude, float air_density, float dust_density, float ozone_density) + : sun_elevation(sun_elevation), + altitude(altitude), + air_density(air_density), + dust_density(dust_density), + ozone_density(ozone_density) +{ +} + +SkyLoader::~SkyLoader(){}; + +bool SkyLoader::load_metadata(ImageMetaData &metadata) +{ + metadata.width = 512; + metadata.height = 128; + metadata.channels = 3; + metadata.depth = 1; + metadata.type = IMAGE_DATA_TYPE_FLOAT4; + metadata.compress_as_srgb = false; + return true; +} + +bool SkyLoader::load_pixels(const ImageMetaData &metadata, + void *pixels, + const size_t /*pixels_size*/, + const bool /*associate_alpha*/) +{ + /* definitions */ + int width = metadata.width; + int height = metadata.height; + float *pixel_data = (float *)pixels; + float altitude_f = (float)altitude; + + /* 
precompute sky texture */ + const int rows_per_task = divide_up(1024, width); + parallel_for(blocked_range<size_t>(0, height, rows_per_task), + [&](const blocked_range<size_t> &r) { + nishita_skymodel_precompute_texture(pixel_data, + metadata.channels, + r.begin(), + r.end(), + width, + height, + sun_elevation, + altitude_f, + air_density, + dust_density, + ozone_density); + }); + + return true; +} + +string SkyLoader::name() const +{ + return "sky_nishita"; +} + +bool SkyLoader::equals(const ImageLoader & /*other*/) const +{ + return false; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/image_sky.h b/intern/cycles/render/image_sky.h new file mode 100644 index 00000000000..cf4a3e8942c --- /dev/null +++ b/intern/cycles/render/image_sky.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011-2020 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "render/image.h" + +CCL_NAMESPACE_BEGIN + +class SkyLoader : public ImageLoader { + private: + float sun_elevation; + int altitude; + float air_density; + float dust_density; + float ozone_density; + + public: + SkyLoader(float sun_elevation, + int altitude, + float air_density, + float dust_density, + float ozone_density); + ~SkyLoader(); + + bool load_metadata(ImageMetaData &metadata) override; + + bool load_pixels(const ImageMetaData &metadata, + void *pixels, + const size_t /*pixels_size*/, + const bool /*associate_alpha*/) override; + + string name() const override; + + bool equals(const ImageLoader & /*other*/) const override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index d4beb06e57b..eff416efa2b 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -29,6 +29,7 @@ #include "util/util_foreach.h" #include "util/util_hash.h" #include "util/util_logging.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index cb7474017fa..c0615c6217b 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -31,6 +31,7 @@ #include "util/util_logging.h" #include "util/util_path.h" #include "util/util_progress.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -450,6 +451,7 @@ void LightManager::device_update_distribution(Device *, /* update device */ KernelIntegrator *kintegrator = &dscene->data.integrator; + KernelBackground *kbackground = &dscene->data.background; KernelFilm *kfilm = &dscene->data.film; kintegrator->use_direct_light = (totarea > 0.0f); @@ -493,15 +495,18 @@ void LightManager::device_update_distribution(Device *, /* Portals */ if (num_portals > 0) { - kintegrator->portal_offset = light_index; - kintegrator->num_portals = num_portals; - kintegrator->portal_pdf = background_mis ? 
0.5f : 1.0f; + kbackground->portal_offset = light_index; + kbackground->num_portals = num_portals; + kbackground->portal_weight = 1.0f; } else { - kintegrator->num_portals = 0; - kintegrator->portal_offset = 0; - kintegrator->portal_pdf = 0.0f; + kbackground->num_portals = 0; + kbackground->portal_offset = 0; + kbackground->portal_weight = 0.0f; } + + /* Map */ + kbackground->map_weight = background_mis ? 1.0f : 0.0f; } else { dscene->light_distribution.free(); @@ -511,9 +516,12 @@ void LightManager::device_update_distribution(Device *, kintegrator->pdf_triangles = 0.0f; kintegrator->pdf_lights = 0.0f; kintegrator->use_lamp_mis = false; - kintegrator->num_portals = 0; - kintegrator->portal_offset = 0; - kintegrator->portal_pdf = 0.0f; + + kbackground->num_portals = 0; + kbackground->portal_offset = 0; + kbackground->portal_weight = 0.0f; + kbackground->sun_weight = 0.0f; + kbackground->map_weight = 0.0f; kfilm->pass_shadow_scale = 1.0f; } @@ -562,7 +570,7 @@ void LightManager::device_update_background(Device *device, Scene *scene, Progress &progress) { - KernelIntegrator *kintegrator = &dscene->data.integrator; + KernelBackground *kbackground = &dscene->data.background; Light *background_light = NULL; /* find background light */ @@ -575,31 +583,79 @@ void LightManager::device_update_background(Device *device, /* no background light found, signal renderer to skip sampling */ if (!background_light || !background_light->is_enabled) { - kintegrator->pdf_background_res_x = 0; - kintegrator->pdf_background_res_y = 0; + kbackground->map_res_x = 0; + kbackground->map_res_y = 0; + kbackground->map_weight = 0.0f; + kbackground->sun_weight = 0.0f; + kbackground->use_mis = (kbackground->portal_weight > 0.0f); return; } progress.set_status("Updating Lights", "Importance map"); - assert(kintegrator->use_direct_light); + assert(dscene->data.integrator.use_direct_light); + + int2 environment_res = make_int2(0, 0); + Shader *shader = scene->background->get_shader(scene); + int 
num_suns = 0; + foreach (ShaderNode *node, shader->graph->nodes) { + if (node->type == EnvironmentTextureNode::node_type) { + EnvironmentTextureNode *env = (EnvironmentTextureNode *)node; + ImageMetaData metadata; + if (!env->handle.empty()) { + ImageMetaData metadata = env->handle.metadata(); + environment_res.x = max(environment_res.x, metadata.width); + environment_res.y = max(environment_res.y, metadata.height); + } + } + if (node->type == SkyTextureNode::node_type) { + SkyTextureNode *sky = (SkyTextureNode *)node; + if (sky->type == NODE_SKY_NISHITA && sky->sun_disc) { + /* Ensure that the input coordinates aren't transformed before they reach the node. + * If that is the case, the logic used for sampling the sun's location does not work + * and we have to fall back to map-based sampling. */ + const ShaderInput *vec_in = sky->input("Vector"); + if (vec_in && vec_in->link && vec_in->link->parent) { + ShaderNode *vec_src = vec_in->link->parent; + if ((vec_src->type != TextureCoordinateNode::node_type) || + (vec_in->link != vec_src->output("Generated"))) { + environment_res.x = max(environment_res.x, 4096); + environment_res.y = max(environment_res.y, 2048); + continue; + } + } + + float latitude = sky->sun_elevation; + float longitude = M_2PI_F - sky->sun_rotation + M_PI_2_F; + float half_angle = sky->sun_size * 0.5f; + kbackground->sun = make_float4(cosf(latitude) * cosf(longitude), + cosf(latitude) * sinf(longitude), + sinf(latitude), + half_angle); + kbackground->sun_weight = 4.0f; + environment_res.x = max(environment_res.x, 512); + environment_res.y = max(environment_res.y, 256); + num_suns++; + } + } + } + + /* If there's more than one sun, fall back to map sampling instead. */ + if (num_suns != 1) { + kbackground->sun_weight = 0.0f; + environment_res.x = max(environment_res.x, 4096); + environment_res.y = max(environment_res.y, 2048); + } + + /* Enable MIS for background sampling if any strategy is active. 
*/ + kbackground->use_mis = (kbackground->portal_weight + kbackground->map_weight + + kbackground->sun_weight) > 0.0f; /* get the resolution from the light's size (we stuff it in there) */ int2 res = make_int2(background_light->map_resolution, background_light->map_resolution / 2); /* If the resolution isn't set manually, try to find an environment texture. */ if (res.x == 0) { - Shader *shader = scene->background->get_shader(scene); - foreach (ShaderNode *node, shader->graph->nodes) { - if (node->type == EnvironmentTextureNode::node_type) { - EnvironmentTextureNode *env = (EnvironmentTextureNode *)node; - ImageMetaData metadata; - if (!env->handle.empty()) { - ImageMetaData metadata = env->handle.metadata(); - res.x = max(res.x, metadata.width); - res.y = max(res.y, metadata.height); - } - } - } + res = environment_res; if (res.x > 0 && res.y > 0) { VLOG(2) << "Automatically set World MIS resolution to " << res.x << " by " << res.y << "\n"; } @@ -609,8 +665,8 @@ void LightManager::device_update_background(Device *device, res = make_int2(1024, 512); VLOG(2) << "Setting World MIS resolution to default\n"; } - kintegrator->pdf_background_res_x = res.x; - kintegrator->pdf_background_res_y = res.y; + kbackground->map_res_x = res.x; + kbackground->map_res_y = res.y; vector<float3> pixels; shade_background_pixels(device, dscene, res.x, res.y, pixels, progress); @@ -624,29 +680,13 @@ void LightManager::device_update_background(Device *device, float2 *cond_cdf = dscene->light_background_conditional_cdf.alloc(cdf_width * res.y); double time_start = time_dt(); - if (max(res.x, res.y) < 512) { - /* Small enough resolution, faster to do single-threaded. */ - background_cdf(0, res.y, res.x, res.y, &pixels, cond_cdf); - } - else { - /* Threaded evaluation for large resolution. 
*/ - const int num_blocks = TaskScheduler::num_threads(); - const int chunk_size = res.y / num_blocks; - int start_row = 0; - TaskPool pool; - for (int i = 0; i < num_blocks; ++i) { - const int current_chunk_size = (i != num_blocks - 1) ? chunk_size : (res.y - i * chunk_size); - pool.push(function_bind(&background_cdf, - start_row, - start_row + current_chunk_size, - res.x, - res.y, - &pixels, - cond_cdf)); - start_row += current_chunk_size; - } - pool.wait_work(); - } + + /* Create CDF in parallel. */ + const int rows_per_task = divide_up(10240, res.x); + parallel_for(blocked_range<size_t>(0, res.y, rows_per_task), + [&](const blocked_range<size_t> &r) { + background_cdf(r.begin(), r.end(), res.x, res.y, &pixels, cond_cdf); + }); /* marginal CDFs (column, V direction, sum of rows) */ marg_cdf[0].x = cond_cdf[res.x].x; diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index cdcaeb246dd..ab392839e52 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -19,6 +19,7 @@ #include "render/constant_fold.h" #include "render/film.h" #include "render/image.h" +#include "render/image_sky.h" #include "render/integrator.h" #include "render/light.h" #include "render/mesh.h" @@ -630,7 +631,7 @@ typedef struct SunSky { /* Parameter */ float radiance_x, radiance_y, radiance_z; - float config_x[9], config_y[9], config_z[9]; + float config_x[9], config_y[9], config_z[9], nishita_data[9]; } SunSky; /* Preetham model */ @@ -640,7 +641,7 @@ static float sky_perez_function(float lam[6], float theta, float gamma) (1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cosf(gamma) * cosf(gamma)); } -static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float turbidity) +static void sky_texture_precompute_preetham(SunSky *sunsky, float3 dir, float turbidity) { /* * We re-use the SunSky struct of the new model, to avoid extra variables @@ -703,10 +704,10 @@ static void sky_texture_precompute_old(SunSky *sunsky, float3 dir, float 
turbidi } /* Hosek / Wilkie */ -static void sky_texture_precompute_new(SunSky *sunsky, - float3 dir, - float turbidity, - float ground_albedo) +static void sky_texture_precompute_hosek(SunSky *sunsky, + float3 dir, + float turbidity, + float ground_albedo) { /* Calculate Sun Direction and save coordinates */ float2 spherical = sky_spherical_coordinates(dir); @@ -743,6 +744,34 @@ static void sky_texture_precompute_new(SunSky *sunsky, arhosekskymodelstate_free(sky_state); } +/* Nishita improved */ +static void sky_texture_precompute_nishita(SunSky *sunsky, + bool sun_disc, + float sun_size, + float sun_elevation, + float sun_rotation, + int altitude, + float air_density, + float dust_density) +{ + /* sample 2 sun pixels */ + float pixel_bottom[3]; + float pixel_top[3]; + float altitude_f = (float)altitude; + nishita_skymodel_precompute_sun( + sun_elevation, sun_size, altitude_f, air_density, dust_density, pixel_bottom, pixel_top); + /* send data to svm_sky */ + sunsky->nishita_data[0] = pixel_bottom[0]; + sunsky->nishita_data[1] = pixel_bottom[1]; + sunsky->nishita_data[2] = pixel_bottom[2]; + sunsky->nishita_data[3] = pixel_top[0]; + sunsky->nishita_data[4] = pixel_top[1]; + sunsky->nishita_data[5] = pixel_top[2]; + sunsky->nishita_data[6] = sun_elevation; + sunsky->nishita_data[7] = M_2PI_F - sun_rotation; + sunsky->nishita_data[8] = sun_disc ? 
sun_size : 0.0f; +} + NODE_DEFINE(SkyTextureNode) { NodeType *type = NodeType::add("sky_texture", create, NodeType::SHADER); @@ -750,13 +779,22 @@ NODE_DEFINE(SkyTextureNode) TEXTURE_MAPPING_DEFINE(SkyTextureNode); static NodeEnum type_enum; - type_enum.insert("preetham", NODE_SKY_OLD); - type_enum.insert("hosek_wilkie", NODE_SKY_NEW); - SOCKET_ENUM(type, "Type", type_enum, NODE_SKY_NEW); + type_enum.insert("preetham", NODE_SKY_PREETHAM); + type_enum.insert("hosek_wilkie", NODE_SKY_HOSEK); + type_enum.insert("nishita_improved", NODE_SKY_NISHITA); + SOCKET_ENUM(type, "Type", type_enum, NODE_SKY_NISHITA); SOCKET_VECTOR(sun_direction, "Sun Direction", make_float3(0.0f, 0.0f, 1.0f)); SOCKET_FLOAT(turbidity, "Turbidity", 2.2f); SOCKET_FLOAT(ground_albedo, "Ground Albedo", 0.3f); + SOCKET_BOOLEAN(sun_disc, "Sun Disc", true); + SOCKET_FLOAT(sun_size, "Sun Size", 0.009512f); + SOCKET_FLOAT(sun_elevation, "Sun Elevation", M_PI_2_F); + SOCKET_FLOAT(sun_rotation, "Sun Rotation", 0.0f); + SOCKET_INT(altitude, "Altitude", 0); + SOCKET_FLOAT(air_density, "Air", 1.0f); + SOCKET_FLOAT(dust_density, "Dust", 1.0f); + SOCKET_FLOAT(ozone_density, "Ozone", 1.0f); SOCKET_IN_POINT( vector, "Vector", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TEXTURE_GENERATED); @@ -776,10 +814,32 @@ void SkyTextureNode::compile(SVMCompiler &compiler) ShaderOutput *color_out = output("Color"); SunSky sunsky; - if (type == NODE_SKY_OLD) - sky_texture_precompute_old(&sunsky, sun_direction, turbidity); - else if (type == NODE_SKY_NEW) - sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo); + if (type == NODE_SKY_PREETHAM) + sky_texture_precompute_preetham(&sunsky, sun_direction, turbidity); + else if (type == NODE_SKY_HOSEK) + sky_texture_precompute_hosek(&sunsky, sun_direction, turbidity, ground_albedo); + else if (type == NODE_SKY_NISHITA) { + sky_texture_precompute_nishita(&sunsky, + sun_disc, + sun_size, + sun_elevation, + sun_rotation, + altitude, + air_density, + 
dust_density); + /* precomputed texture image parameters */ + ImageManager *image_manager = compiler.scene->image_manager; + ImageParams impar; + impar.interpolation = INTERPOLATION_LINEAR; + impar.extension = EXTENSION_EXTEND; + + /* precompute sky texture */ + if (handle.empty()) { + SkyLoader *loader = new SkyLoader( + sun_elevation, altitude, air_density, dust_density, ozone_density); + handle = image_manager->add_image(loader, impar); + } + } else assert(false); @@ -787,38 +847,52 @@ void SkyTextureNode::compile(SVMCompiler &compiler) compiler.stack_assign(color_out); compiler.add_node(NODE_TEX_SKY, vector_offset, compiler.stack_assign(color_out), type); - compiler.add_node(__float_as_uint(sunsky.phi), - __float_as_uint(sunsky.theta), - __float_as_uint(sunsky.radiance_x), - __float_as_uint(sunsky.radiance_y)); - compiler.add_node(__float_as_uint(sunsky.radiance_z), - __float_as_uint(sunsky.config_x[0]), - __float_as_uint(sunsky.config_x[1]), - __float_as_uint(sunsky.config_x[2])); - compiler.add_node(__float_as_uint(sunsky.config_x[3]), - __float_as_uint(sunsky.config_x[4]), - __float_as_uint(sunsky.config_x[5]), - __float_as_uint(sunsky.config_x[6])); - compiler.add_node(__float_as_uint(sunsky.config_x[7]), - __float_as_uint(sunsky.config_x[8]), - __float_as_uint(sunsky.config_y[0]), - __float_as_uint(sunsky.config_y[1])); - compiler.add_node(__float_as_uint(sunsky.config_y[2]), - __float_as_uint(sunsky.config_y[3]), - __float_as_uint(sunsky.config_y[4]), - __float_as_uint(sunsky.config_y[5])); - compiler.add_node(__float_as_uint(sunsky.config_y[6]), - __float_as_uint(sunsky.config_y[7]), - __float_as_uint(sunsky.config_y[8]), - __float_as_uint(sunsky.config_z[0])); - compiler.add_node(__float_as_uint(sunsky.config_z[1]), - __float_as_uint(sunsky.config_z[2]), - __float_as_uint(sunsky.config_z[3]), - __float_as_uint(sunsky.config_z[4])); - compiler.add_node(__float_as_uint(sunsky.config_z[5]), - __float_as_uint(sunsky.config_z[6]), - 
__float_as_uint(sunsky.config_z[7]), - __float_as_uint(sunsky.config_z[8])); + /* nishita doesn't need this data */ + if (type != NODE_SKY_NISHITA) { + compiler.add_node(__float_as_uint(sunsky.phi), + __float_as_uint(sunsky.theta), + __float_as_uint(sunsky.radiance_x), + __float_as_uint(sunsky.radiance_y)); + compiler.add_node(__float_as_uint(sunsky.radiance_z), + __float_as_uint(sunsky.config_x[0]), + __float_as_uint(sunsky.config_x[1]), + __float_as_uint(sunsky.config_x[2])); + compiler.add_node(__float_as_uint(sunsky.config_x[3]), + __float_as_uint(sunsky.config_x[4]), + __float_as_uint(sunsky.config_x[5]), + __float_as_uint(sunsky.config_x[6])); + compiler.add_node(__float_as_uint(sunsky.config_x[7]), + __float_as_uint(sunsky.config_x[8]), + __float_as_uint(sunsky.config_y[0]), + __float_as_uint(sunsky.config_y[1])); + compiler.add_node(__float_as_uint(sunsky.config_y[2]), + __float_as_uint(sunsky.config_y[3]), + __float_as_uint(sunsky.config_y[4]), + __float_as_uint(sunsky.config_y[5])); + compiler.add_node(__float_as_uint(sunsky.config_y[6]), + __float_as_uint(sunsky.config_y[7]), + __float_as_uint(sunsky.config_y[8]), + __float_as_uint(sunsky.config_z[0])); + compiler.add_node(__float_as_uint(sunsky.config_z[1]), + __float_as_uint(sunsky.config_z[2]), + __float_as_uint(sunsky.config_z[3]), + __float_as_uint(sunsky.config_z[4])); + compiler.add_node(__float_as_uint(sunsky.config_z[5]), + __float_as_uint(sunsky.config_z[6]), + __float_as_uint(sunsky.config_z[7]), + __float_as_uint(sunsky.config_z[8])); + } + else { + compiler.add_node(__float_as_uint(sunsky.nishita_data[0]), + __float_as_uint(sunsky.nishita_data[1]), + __float_as_uint(sunsky.nishita_data[2]), + __float_as_uint(sunsky.nishita_data[3])); + compiler.add_node(__float_as_uint(sunsky.nishita_data[4]), + __float_as_uint(sunsky.nishita_data[5]), + __float_as_uint(sunsky.nishita_data[6]), + __float_as_uint(sunsky.nishita_data[7])); + compiler.add_node(__float_as_uint(sunsky.nishita_data[8]), 
handle.svm_slot(), 0, 0); + } tex_mapping.compile_end(compiler, vector_in, vector_offset); } @@ -828,10 +902,32 @@ void SkyTextureNode::compile(OSLCompiler &compiler) tex_mapping.compile(compiler); SunSky sunsky; - if (type == NODE_SKY_OLD) - sky_texture_precompute_old(&sunsky, sun_direction, turbidity); - else if (type == NODE_SKY_NEW) - sky_texture_precompute_new(&sunsky, sun_direction, turbidity, ground_albedo); + if (type == NODE_SKY_PREETHAM) + sky_texture_precompute_preetham(&sunsky, sun_direction, turbidity); + else if (type == NODE_SKY_HOSEK) + sky_texture_precompute_hosek(&sunsky, sun_direction, turbidity, ground_albedo); + else if (type == NODE_SKY_NISHITA) { + sky_texture_precompute_nishita(&sunsky, + sun_disc, + sun_size, + sun_elevation, + sun_rotation, + altitude, + air_density, + dust_density); + /* precomputed texture image parameters */ + ImageManager *image_manager = compiler.scene->image_manager; + ImageParams impar; + impar.interpolation = INTERPOLATION_LINEAR; + impar.extension = EXTENSION_EXTEND; + + /* precompute sky texture */ + if (handle.empty()) { + SkyLoader *loader = new SkyLoader( + sun_elevation, altitude, air_density, dust_density, ozone_density); + handle = image_manager->add_image(loader, impar); + } + } else assert(false); @@ -843,6 +939,11 @@ void SkyTextureNode::compile(OSLCompiler &compiler) compiler.parameter_array("config_x", sunsky.config_x, 9); compiler.parameter_array("config_y", sunsky.config_y, 9); compiler.parameter_array("config_z", sunsky.config_z, 9); + compiler.parameter_array("nishita_data", sunsky.nishita_data, 9); + /* nishita texture */ + if (type == NODE_SKY_NISHITA) { + compiler.parameter_texture("filename", handle.svm_slot()); + } compiler.add(this, "node_sky_texture"); } diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index 83c3ad071ae..846ba7423e5 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -168,7 +168,16 @@ class SkyTextureNode : public 
TextureNode { float3 sun_direction; float turbidity; float ground_albedo; + bool sun_disc; + float sun_size; + float sun_elevation; + float sun_rotation; + int altitude; + float air_density; + float dust_density; + float ozone_density; float3 vector; + ImageHandle handle; }; class OutputNode : public ShaderNode { diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index 61deef4cd76..c45ae5553a8 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -31,6 +31,7 @@ #include "util/util_murmurhash.h" #include "util/util_progress.h" #include "util/util_set.h" +#include "util/util_task.h" #include "util/util_vector.h" #include "subd/subd_patch_table.h" @@ -77,7 +78,6 @@ struct UpdateObjectTransformState { Scene *scene; /* Some locks to keep everything thread-safe. */ - thread_spin_lock queue_lock; thread_spin_lock surface_area_lock; /* First unused object index in the queue. */ @@ -219,7 +219,6 @@ void Object::tag_update(Scene *scene) } scene->camera->need_flags_update = true; - scene->curve_system_manager->need_update = true; scene->geometry_manager->need_update = true; scene->object_manager->need_update = true; } @@ -550,41 +549,6 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s } } -bool ObjectManager::device_update_object_transform_pop_work(UpdateObjectTransformState *state, - int *start_index, - int *num_objects) -{ - /* Tweakable parameter, number of objects per chunk. - * Too small value will cause some extra overhead due to spin lock, - * too big value might not use all threads nicely. 
- */ - static const int OBJECTS_PER_TASK = 32; - bool have_work = false; - state->queue_lock.lock(); - int num_scene_objects = state->scene->objects.size(); - if (state->queue_start_object < num_scene_objects) { - int count = min(OBJECTS_PER_TASK, num_scene_objects - state->queue_start_object); - *start_index = state->queue_start_object; - *num_objects = count; - state->queue_start_object += count; - have_work = true; - } - state->queue_lock.unlock(); - return have_work; -} - -void ObjectManager::device_update_object_transform_task(UpdateObjectTransformState *state) -{ - int start_index, num_objects; - while (device_update_object_transform_pop_work(state, &start_index, &num_objects)) { - for (int i = 0; i < num_objects; ++i) { - const int object_index = start_index + i; - Object *ob = state->scene->objects[object_index]; - device_update_object_transform(state, ob); - } - } -} - void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, Progress &progress) { UpdateObjectTransformState state; @@ -630,28 +594,19 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, numparticles += psys->particles.size(); } - /* NOTE: If it's just a handful of objects we deal with them in a single - * thread to avoid threading overhead. However, this threshold is might - * need some tweaks to make mid-complex scenes optimal. - */ - if (scene->objects.size() < 64) { - foreach (Object *ob, scene->objects) { - device_update_object_transform(&state, ob); - if (progress.get_cancel()) { - return; - } - } - } - else { - const int num_threads = TaskScheduler::num_threads(); - TaskPool pool; - for (int i = 0; i < num_threads; ++i) { - pool.push(function_bind(&ObjectManager::device_update_object_transform_task, this, &state)); - } - pool.wait_work(); - if (progress.get_cancel()) { - return; - } + /* Parallel object update, with grain size to avoid too much threading overhead + * for individual objects. 
*/ + static const int OBJECTS_PER_TASK = 32; + parallel_for(blocked_range<size_t>(0, scene->objects.size(), OBJECTS_PER_TASK), + [&](const blocked_range<size_t> &r) { + for (size_t i = r.begin(); i != r.end(); i++) { + Object *ob = state.scene->objects[i]; + device_update_object_transform(&state, ob); + } + }); + + if (progress.get_cancel()) { + return; } dscene->objects.copy_to_device(); @@ -664,7 +619,6 @@ void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, dscene->data.bvh.have_motion = state.have_motion; dscene->data.bvh.have_curves = state.have_curves; - dscene->data.bvh.have_instancing = true; } void ObjectManager::device_update(Device *device, @@ -839,7 +793,6 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P bool motion_blur = need_motion == Scene::MOTION_BLUR; bool apply_to_motion = need_motion != Scene::MOTION_PASS; int i = 0; - bool have_instancing = false; foreach (Object *object, scene->objects) { map<Geometry *, int>::iterator it = geometry_users.find(object->geometry); @@ -885,22 +838,15 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, P if (geom->transform_negative_scaled) object_flag[i] |= SD_OBJECT_NEGATIVE_SCALE_APPLIED; } - else - have_instancing = true; } - else - have_instancing = true; i++; } - - dscene->data.bvh.have_instancing = have_instancing; } void ObjectManager::tag_update(Scene *scene) { need_update = true; - scene->curve_system_manager->need_update = true; scene->geometry_manager->need_update = true; scene->light_manager->need_update = true; } diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index f5b68d5a4fe..9016a8d325f 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -108,7 +108,6 @@ Scene::Scene(const SceneParams ¶ms_, Device *device) integrator = new Integrator(); image_manager = new ImageManager(device->info); particle_system_manager = new ParticleSystemManager(); - 
curve_system_manager = new CurveSystemManager(); bake_manager = new BakeManager(); /* OSL only works on the CPU */ @@ -156,7 +155,6 @@ void Scene::free_memory(bool final) light_manager->device_free(device, &dscene); particle_system_manager->device_free(device, &dscene); - curve_system_manager->device_free(device, &dscene); bake_manager->device_free(device, &dscene); @@ -180,7 +178,6 @@ void Scene::free_memory(bool final) delete shader_manager; delete light_manager; delete particle_system_manager; - delete curve_system_manager; delete image_manager; delete bake_manager; } @@ -233,12 +230,6 @@ void Scene::device_update(Device *device_, Progress &progress) if (progress.get_cancel() || device->have_error()) return; - progress.set_status("Updating Hair Systems"); - curve_system_manager->device_update(device, &dscene, this, progress); - - if (progress.get_cancel() || device->have_error()) - return; - progress.set_status("Updating Particle Systems"); particle_system_manager->device_update(device, &dscene, this, progress); @@ -369,8 +360,7 @@ bool Scene::need_data_update() return (background->need_update || image_manager->need_update || object_manager->need_update || geometry_manager->need_update || light_manager->need_update || lookup_tables->need_update || integrator->need_update || shader_manager->need_update || - particle_system_manager->need_update || curve_system_manager->need_update || - bake_manager->need_update || film->need_update); + particle_system_manager->need_update || bake_manager->need_update || film->need_update); } bool Scene::need_reset() @@ -393,7 +383,6 @@ void Scene::reset() geometry_manager->tag_update(this); light_manager->tag_update(this); particle_system_manager->tag_update(this); - curve_system_manager->tag_update(this); } void Scene::device_free() diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 6b10a901d7b..67616262c03 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -168,6 +168,8 @@ 
class SceneParams { bool use_bvh_spatial_split; bool use_bvh_unaligned_nodes; int num_bvh_time_steps; + int hair_subdivisions; + CurveShapeType hair_shape; bool persistent_data; int texture_limit; @@ -181,6 +183,8 @@ class SceneParams { use_bvh_spatial_split = false; use_bvh_unaligned_nodes = true; num_bvh_time_steps = 0; + hair_subdivisions = 3; + hair_shape = CURVE_RIBBON; persistent_data = false; texture_limit = 0; background = true; @@ -193,8 +197,15 @@ class SceneParams { use_bvh_spatial_split == params.use_bvh_spatial_split && use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes && num_bvh_time_steps == params.num_bvh_time_steps && + hair_subdivisions == params.hair_subdivisions && hair_shape == params.hair_shape && persistent_data == params.persistent_data && texture_limit == params.texture_limit); } + + int curve_subdivisions() + { + /* Matching the tesselation rate limit in Embree. */ + return clamp(1 << hair_subdivisions, 1, 16); + } }; /* Scene */ @@ -226,7 +237,6 @@ class Scene { GeometryManager *geometry_manager; ObjectManager *object_manager; ParticleSystemManager *particle_system_manager; - CurveSystemManager *curve_system_manager; BakeManager *bake_manager; /* default shaders */ diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index f5bfebbaf78..1a94d3e9db7 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -61,8 +61,10 @@ Session::Session(const SessionParams &params_) TaskScheduler::init(params.threads); + /* Create CPU/GPU devices. */ device = Device::create(params.device, stats, profiler, params.background); + /* Create buffers for interactive rendering. */ if (params.background && !params.write_render_cb) { buffers = NULL; display = NULL; @@ -72,6 +74,9 @@ Session::Session(const SessionParams &params_) display = new DisplayBuffer(device, params.display_buffer_linear); } + /* Validate denoising parameters. 
*/ + set_denoising(params.denoising); + session_thread = NULL; scene = NULL; @@ -773,6 +778,7 @@ DeviceRequestedFeatures Session::get_requested_device_features() */ bool use_motion = scene->need_motion() == Scene::MotionType::MOTION_BLUR; requested_features.use_hair = false; + requested_features.use_hair_thick = (scene->params.hair_shape == CURVE_THICK); requested_features.use_object_motion = false; requested_features.use_camera_motion = use_motion && scene->camera->use_motion(); foreach (Object *object, scene->objects) { @@ -804,7 +810,7 @@ DeviceRequestedFeatures Session::get_requested_device_features() requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - if (params.run_denoising) { + if (params.denoising.use || params.denoising.store_passes) { requested_features.use_denoising = true; requested_features.use_shadow_tricks = true; } @@ -941,24 +947,35 @@ void Session::set_pause(bool pause_) pause_cond.notify_all(); } -void Session::set_denoising(bool denoising, bool optix_denoising) +void Session::set_denoising(const DenoiseParams &denoising) { + bool need_denoise = denoising.need_denoising_task(); + /* Lock buffers so no denoising operation is triggered while the settings are changed here. */ thread_scoped_lock buffers_lock(buffers_mutex); + params.denoising = denoising; + + if (!(params.device.denoisers & denoising.type)) { + if (need_denoise) { + progress.set_error("Denoiser type not supported by compute device"); + } - params.run_denoising = denoising; - params.full_denoising = !optix_denoising; - params.optix_denoising = optix_denoising; + params.denoising.use = false; + need_denoise = false; + } // TODO(pmours): Query the required overlap value for denoising from the device? - tile_manager.slice_overlap = denoising && !params.background ? 
64 : 0; - tile_manager.schedule_denoising = denoising && !buffers; + tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0; + + /* Schedule per tile denoising for final renders if we are either denoising or + * need prefiltered passes for the native denoiser. */ + tile_manager.schedule_denoising = need_denoise && !buffers; } void Session::set_denoising_start_sample(int sample) { - if (sample != params.denoising_start_sample) { - params.denoising_start_sample = sample; + if (sample != params.denoising.start_sample) { + params.denoising.start_sample = sample; pause_cond.notify_all(); } @@ -1078,10 +1095,10 @@ void Session::update_status_time(bool show_pause, bool show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } - if (params.full_denoising || params.optix_denoising) { + if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) { substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); } - else if (params.run_denoising) { + else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) { substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles()); } } @@ -1110,7 +1127,7 @@ bool Session::render_need_denoise(bool &delayed) delayed = false; /* Denoising enabled? */ - if (!params.run_denoising) { + if (!params.denoising.need_denoising_task()) { return false; } @@ -1127,7 +1144,7 @@ bool Session::render_need_denoise(bool &delayed) } /* Do not denoise until the sample at which denoising should start is reached. 
*/ - if (tile_manager.state.sample < params.denoising_start_sample) { + if (tile_manager.state.sample < min(params.denoising.start_sample, params.samples - 1)) { return false; } @@ -1178,9 +1195,6 @@ void Session::render(bool need_denoise) task.pass_denoising_clean = scene->film->denoising_clean_offset; task.denoising_from_render = true; - task.denoising_do_filter = params.full_denoising; - task.denoising_use_optix = params.optix_denoising; - task.denoising_write_passes = params.write_denoising_passes; if (tile_manager.schedule_denoising) { /* Acquire denoising tiles during rendering. */ diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 2707eed5531..0141629762c 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -62,10 +62,6 @@ class SessionParams { bool display_buffer_linear; - bool run_denoising; - bool write_denoising_passes; - bool full_denoising; - bool optix_denoising; DenoiseParams denoising; double cancel_timeout; @@ -94,11 +90,6 @@ class SessionParams { use_profiling = false; - run_denoising = false; - write_denoising_passes = false; - full_denoising = false; - optix_denoising = false; - display_buffer_linear = false; cancel_timeout = 0.1; @@ -125,7 +116,8 @@ class SessionParams { cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout && text_timeout == params.text_timeout && progressive_update_timeout == params.progressive_update_timeout && - tile_order == params.tile_order && shadingsystem == params.shadingsystem); + tile_order == params.tile_order && shadingsystem == params.shadingsystem && + denoising.type == params.denoising.type); } }; @@ -161,7 +153,7 @@ class Session { void reset(BufferParams &params, int samples); void set_pause(bool pause); void set_samples(int samples); - void set_denoising(bool denoising, bool optix_denoising); + void set_denoising(const DenoiseParams &denoising); void set_denoising_start_sample(int sample); bool update_scene(); diff --git 
a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 39ba45a751a..1120d909e98 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -33,6 +33,7 @@ #include "util/util_foreach.h" #include "util/util_murmurhash.h" +#include "util/util_task.h" #ifdef WITH_OCIO # include <OpenColorIO/OpenColorIO.h> diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index ea3dbaf8e03..88714e20a90 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -94,8 +94,7 @@ void SVMShaderManager::device_update(Device *device, scene, scene->shaders[i], &progress, - &shader_svm_nodes[i]), - false); + &shader_svm_nodes[i])); } task_pool.wait_work(); diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp index 87389ebfb16..4ea3470cda8 100644 --- a/intern/cycles/test/render_graph_finalize_test.cpp +++ b/intern/cycles/test/render_graph_finalize_test.cpp @@ -17,11 +17,15 @@ #include "testing/mock_log.h" #include "testing/testing.h" +#include "device/device.h" + #include "render/graph.h" #include "render/nodes.h" #include "render/scene.h" + #include "util/util_array.h" #include "util/util_logging.h" +#include "util/util_stats.h" #include "util/util_string.h" #include "util/util_vector.h" diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index c1f71461dfd..ad4ea9c86e0 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -29,7 +29,7 @@ set(SRC ) set(LIB - + ${TBB_LIBRARIES} ) if(WITH_CYCLES_STANDALONE) @@ -86,6 +86,7 @@ set(SRC_HEADERS util_math_matrix.h util_md5.h util_murmurhash.h + util_openimagedenoise.h util_opengl.h util_optimization.h util_param.h @@ -100,6 +101,7 @@ set(SRC_HEADERS util_sky_model.cpp util_sky_model.h util_sky_model_data.h + util_sky_nishita.cpp util_avxf.h util_avxb.h util_semaphore.h @@ -112,6 +114,7 @@ set(SRC_HEADERS util_string.h util_system.h 
util_task.h + util_tbb.h util_texture.h util_thread.h util_time.h diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 3ce65802cff..6ad4f709ab5 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -31,7 +31,7 @@ DebugFlags::CPU::CPU() sse41(true), sse3(true), sse2(true), - bvh_layout(BVH_LAYOUT_DEFAULT), + bvh_layout(BVH_LAYOUT_AUTO), split_kernel(false) { reset(); @@ -57,18 +57,7 @@ void DebugFlags::CPU::reset() #undef STRINGIFY #undef CHECK_CPU_FLAGS - if (getenv("CYCLES_BVH2") != NULL) { - bvh_layout = BVH_LAYOUT_BVH2; - } - else if (getenv("CYCLES_BVH4") != NULL) { - bvh_layout = BVH_LAYOUT_BVH4; - } - else if (getenv("CYCLES_BVH8") != NULL) { - bvh_layout = BVH_LAYOUT_BVH8; - } - else { - bvh_layout = BVH_LAYOUT_DEFAULT; - } + bvh_layout = BVH_LAYOUT_AUTO; split_kernel = false; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index cf6b442b878..da9f5408b59 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -73,10 +73,10 @@ class DebugFlags { return sse2; } - /* Requested BVH size. + /* Requested BVH layout. * - * Rendering will use widest possible BVH which is below or equal - * this one. + * By default the fastest will be used. For debugging the BVH used by other + * CPUs and GPUs can be selected here instead. */ BVHLayout bvh_layout; diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h index dbed83ab84d..e979bd9e0c0 100644 --- a/intern/cycles/util/util_math_fast.h +++ b/intern/cycles/util/util_math_fast.h @@ -446,6 +446,11 @@ ccl_device_inline float fast_expf(float x) } #ifndef __KERNEL_GPU__ +/* MSVC seems to have a code-gen bug here in at least SSE41/AVX + * see T78047 for details. 
*/ +# ifdef _MSC_VER +# pragma optimize("", off) +# endif ccl_device float4 fast_exp2f4(float4 x) { const float4 one = make_float4(1.0f); @@ -461,6 +466,9 @@ ccl_device float4 fast_exp2f4(float4 x) r = madd4(x, r, make_float4(1.0f)); return __int4_as_float4(__float4_as_int4(r) + (m << 23)); } +# ifdef _MSC_VER +# pragma optimize("", on) +# endif ccl_device_inline float4 fast_expf4(float4 x) { diff --git a/intern/cycles/util/util_openimagedenoise.h b/intern/cycles/util/util_openimagedenoise.h new file mode 100644 index 00000000000..aafa69cb530 --- /dev/null +++ b/intern/cycles/util/util_openimagedenoise.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_OPENIMAGEDENOISE_H__ +#define __UTIL_OPENIMAGEDENOISE_H__ + +#ifdef WITH_OPENIMAGEDENOISE +# include <OpenImageDenoise/oidn.hpp> +#endif + +#include "util_system.h" + +CCL_NAMESPACE_BEGIN + +static inline bool openimagedenoise_supported() +{ +#ifdef WITH_OPENIMAGEDENOISE + return system_cpu_support_sse41(); +#else + return false; +#endif +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_OPENIMAGEDENOISE_H__ */ diff --git a/intern/cycles/util/util_sky_model.h b/intern/cycles/util/util_sky_model.h index 84340614b2c..36f1079a16d 100644 --- a/intern/cycles/util/util_sky_model.h +++ b/intern/cycles/util/util_sky_model.h @@ -298,6 +298,8 @@ HINT #1: if you want to model the sky of an earth-like planet that orbits previous paragraph. 
*/ +#include "util/util_types.h" + CCL_NAMESPACE_BEGIN #ifndef _SKY_MODEL_H_ @@ -426,4 +428,26 @@ double arhosekskymodel_solar_radiance(ArHosekSkyModelState *state, #endif // _SKY_MODEL_H_ +/* Nishita improved sky model */ + +void nishita_skymodel_precompute_texture(float *pixels, + int stride, + int start_y, + int end_y, + int width, + int height, + float sun_elevation, + float altitude, + float air_density, + float dust_density, + float ozone_density); + +void nishita_skymodel_precompute_sun(float sun_elevation, + float angular_diameter, + float altitude, + float air_density, + float dust_density, + float *pixel_bottom, + float *pixel_top); + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_sky_nishita.cpp b/intern/cycles/util/util_sky_nishita.cpp new file mode 100644 index 00000000000..92397804d43 --- /dev/null +++ b/intern/cycles/util/util_sky_nishita.cpp @@ -0,0 +1,371 @@ +/* + * Copyright 2011-2020 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "util/util_math.h" +#include "util/util_sky_model.h" + +CCL_NAMESPACE_BEGIN + +/* Constants */ +static const float rayleigh_scale = 8000.0f; // Rayleigh scale height (m) +static const float mie_scale = 1200.0f; // Mie scale height (m) +static const float mie_coeff = 2e-5f; // Mie scattering coefficient +static const float mie_G = 0.76f; // aerosols anisotropy +static const float earth_radius = 6360000.0f; // radius of Earth (m) +static const float atmosphere_radius = 6420000.0f; // radius of atmosphere (m) +static const int steps = 32; // segments per primary ray +static const int steps_light = 16; // segments per sun connection ray +static const int num_wavelengths = 21; // number of wavelengths +/* irradiance at top of atmosphere */ +static const float irradiance[] = { + 1.45756829855592995315f, 1.56596305559738380175f, 1.65148449067670455293f, + 1.71496242737209314555f, 1.75797983805020541226f, 1.78256407885924539336f, + 1.79095108475838560302f, 1.78541550133410664714f, 1.76815554864306845317f, + 1.74122069647250410362f, 1.70647127164943679389f, 1.66556087452739887134f, + 1.61993437242451854274f, 1.57083597368892080581f, 1.51932335059305478886f, + 1.46628494965214395407f, 1.41245852740172450623f, 1.35844961970384092709f, + 1.30474913844739281998f, 1.25174963272610817455f, 1.19975998755420620867f}; +/* Rayleigh scattering coefficient */ +static const float rayleigh_coeff[] = { + 0.00005424820087636473f, 0.00004418549866505454f, 0.00003635151910165377f, + 0.00003017929012024763f, 0.00002526320226989157f, 0.00002130859310621843f, + 0.00001809838025320633f, 0.00001547057129129042f, 0.00001330284977336850f, + 0.00001150184784075764f, 0.00000999557429990163f, 0.00000872799973630707f, + 0.00000765513700977967f, 0.00000674217203751443f, 0.00000596134125832052f, + 0.00000529034598065810f, 0.00000471115687557433f, 0.00000420910481110487f, + 0.00000377218381260133f, 0.00000339051255477280f, 0.00000305591531679811f}; +/* Ozone absorption coefficient */ 
+static const float ozone_coeff[] = { + 0.00000000325126849861f, 0.00000000585395365047f, 0.00000001977191155085f, + 0.00000007309568762914f, 0.00000020084561514287f, 0.00000040383958096161f, + 0.00000063551335912363f, 0.00000096707041180970f, 0.00000154797400424410f, + 0.00000209038647223331f, 0.00000246128056164565f, 0.00000273551299461512f, + 0.00000215125863128643f, 0.00000159051840791988f, 0.00000112356197979857f, + 0.00000073527551487574f, 0.00000046450130357806f, 0.00000033096079921048f, + 0.00000022512612292678f, 0.00000014879129266490f, 0.00000016828623364192f}; +/* CIE XYZ color matching functions */ +static const float cmf_xyz[][3] = {{0.00136800000f, 0.00003900000f, 0.00645000100f}, + {0.01431000000f, 0.00039600000f, 0.06785001000f}, + {0.13438000000f, 0.00400000000f, 0.64560000000f}, + {0.34828000000f, 0.02300000000f, 1.74706000000f}, + {0.29080000000f, 0.06000000000f, 1.66920000000f}, + {0.09564000000f, 0.13902000000f, 0.81295010000f}, + {0.00490000000f, 0.32300000000f, 0.27200000000f}, + {0.06327000000f, 0.71000000000f, 0.07824999000f}, + {0.29040000000f, 0.95400000000f, 0.02030000000f}, + {0.59450000000f, 0.99500000000f, 0.00390000000f}, + {0.91630000000f, 0.87000000000f, 0.00165000100f}, + {1.06220000000f, 0.63100000000f, 0.00080000000f}, + {0.85444990000f, 0.38100000000f, 0.00019000000f}, + {0.44790000000f, 0.17500000000f, 0.00002000000f}, + {0.16490000000f, 0.06100000000f, 0.00000000000f}, + {0.04677000000f, 0.01700000000f, 0.00000000000f}, + {0.01135916000f, 0.00410200000f, 0.00000000000f}, + {0.00289932700f, 0.00104700000f, 0.00000000000f}, + {0.00069007860f, 0.00024920000f, 0.00000000000f}, + {0.00016615050f, 0.00006000000f, 0.00000000000f}, + {0.00004150994f, 0.00001499000f, 0.00000000000f}}; + +static float3 geographical_to_direction(float lat, float lon) +{ + return make_float3(cosf(lat) * cosf(lon), cosf(lat) * sinf(lon), sinf(lat)); +} + +static float3 spec_to_xyz(float *spectrum) +{ + float3 xyz = make_float3(0.0f, 0.0f, 0.0f); + for 
(int i = 0; i < num_wavelengths; i++) { + xyz.x += cmf_xyz[i][0] * spectrum[i]; + xyz.y += cmf_xyz[i][1] * spectrum[i]; + xyz.z += cmf_xyz[i][2] * spectrum[i]; + } + return xyz * (20 * 683 * 1e-9f); +} + +/* Atmosphere volume models */ + +static float density_rayleigh(float height) +{ + return expf(-height / rayleigh_scale); +} + +static float density_mie(float height) +{ + return expf(-height / mie_scale); +} + +static float density_ozone(float height) +{ + float den = 0.0f; + if (height >= 10000.0f && height < 25000.0f) + den = 1.0f / 15000.0f * height - 2.0f / 3.0f; + else if (height >= 25000 && height < 40000) + den = -(1.0f / 15000.0f * height - 8.0f / 3.0f); + return den; +} + +static float phase_rayleigh(float mu) +{ + return 3.0f / (16.0f * M_PI_F) * (1.0f + sqr(mu)); +} + +static float phase_mie(float mu) +{ + static const float sqr_G = mie_G * mie_G; + + return (3.0f * (1.0f - sqr_G) * (1.0f + sqr(mu))) / + (8.0f * M_PI_F * (2.0f + sqr_G) * powf((1.0f + sqr_G - 2.0f * mie_G * mu), 1.5)); +} + +/* Intersection helpers */ +static bool surface_intersection(float3 pos, float3 dir) +{ + if (dir.z >= 0) + return false; + float t = dot(dir, -pos) / len_squared(dir); + float D = pos.x * pos.x - 2.0f * (-pos.x) * dir.x * t + dir.x * t * dir.x * t + pos.y * pos.y - + 2.0f * (-pos.y) * dir.y * t + (dir.y * t) * (dir.y * t) + pos.z * pos.z - + 2.0f * (-pos.z) * dir.z * t + dir.z * t * dir.z * t; + return (D <= sqr(earth_radius)); +} + +static float3 atmosphere_intersection(float3 pos, float3 dir) +{ + float b = -2.0f * dot(dir, -pos); + float c = len_squared(pos) - sqr(atmosphere_radius); + float t = (-b + sqrtf(b * b - 4.0f * c)) / 2.0f; + return make_float3(pos.x + dir.x * t, pos.y + dir.y * t, pos.z + dir.z * t); +} + +static float3 ray_optical_depth(float3 ray_origin, float3 ray_dir) +{ + /* This code computes the optical depth along a ray through the atmosphere. 
*/ + float3 ray_end = atmosphere_intersection(ray_origin, ray_dir); + float ray_length = distance(ray_origin, ray_end); + + /* To compute the optical depth, we step along the ray in segments and + * accumulate the optical depth along each segment. */ + float segment_length = ray_length / steps_light; + float3 segment = segment_length * ray_dir; + + /* Instead of tracking the transmission spectrum across all wavelengths directly, + * we use the fact that the density always has the same spectrum for each type of + * scattering, so we split the density into a constant spectrum and a factor and + * only track the factors. */ + float3 optical_depth = make_float3(0.0f, 0.0f, 0.0f); + + /* The density of each segment is evaluated at its middle. */ + float3 P = ray_origin + 0.5f * segment; + for (int i = 0; i < steps_light; i++) { + /* Compute height above sea level. */ + float height = len(P) - earth_radius; + + /* Accumulate optical depth of this segment (density is assumed to be constant along it). */ + float3 density = make_float3( + density_rayleigh(height), density_mie(height), density_ozone(height)); + optical_depth += segment_length * density; + + /* Advance along ray. */ + P += segment; + } + + return optical_depth; +} + +/* Single Scattering implementation */ +static void single_scattering(float3 ray_dir, + float3 sun_dir, + float3 ray_origin, + float air_density, + float dust_density, + float ozone_density, + float *r_spectrum) +{ + /* This code computes single-inscattering along a ray through the atmosphere. */ + float3 ray_end = atmosphere_intersection(ray_origin, ray_dir); + float ray_length = distance(ray_origin, ray_end); + + /* To compute the inscattering, we step along the ray in segments and accumulate + * the inscattering as well as the optical depth along each segment. 
*/ + float segment_length = ray_length / steps; + float3 segment = segment_length * ray_dir; + + /* Instead of tracking the transmission spectrum across all wavelengths directly, + * we use the fact that the density always has the same spectrum for each type of + * scattering, so we split the density into a constant spectrum and a factor and + * only track the factors. */ + float3 optical_depth = make_float3(0.0f, 0.0f, 0.0f); + + /* Zero out light accumulation. */ + for (int wl = 0; wl < num_wavelengths; wl++) { + r_spectrum[wl] = 0.0f; + } + + /* Compute phase function for scattering and the density scale factor. */ + float mu = dot(ray_dir, sun_dir); + float3 phase_function = make_float3(phase_rayleigh(mu), phase_mie(mu), 0.0f); + float3 density_scale = make_float3(air_density, dust_density, ozone_density); + + /* The density and in-scattering of each segment is evaluated at its middle. */ + float3 P = ray_origin + 0.5f * segment; + for (int i = 0; i < steps; i++) { + /* Compute height above sea level. */ + float height = len(P) - earth_radius; + + /* Evaluate and accumulate optical depth along the ray. */ + float3 density = density_scale * make_float3(density_rayleigh(height), + density_mie(height), + density_ozone(height)); + optical_depth += segment_length * density; + + /* If the earth isn't in the way, evaluate inscattering from the sun. 
*/ + if (!surface_intersection(P, sun_dir)) { + float3 light_optical_depth = density_scale * ray_optical_depth(P, sun_dir); + float3 total_optical_depth = optical_depth + light_optical_depth; + + /* attenuation of light */ + for (int wl = 0; wl < num_wavelengths; wl++) { + float3 extinction_density = total_optical_depth * make_float3(rayleigh_coeff[wl], + 1.11f * mie_coeff, + ozone_coeff[wl]); + float attenuation = expf(-reduce_add(extinction_density)); + + float3 scattering_density = density * make_float3(rayleigh_coeff[wl], mie_coeff, 0.0f); + + /* The total inscattered radiance from one segment is: + * Tr(A<->B) * Tr(B<->C) * sigma_s * phase * L * segment_length + * + * These terms are: + * Tr(A<->B): Transmission from start to scattering position (tracked in optical_depth) + * Tr(B<->C): Transmission from scattering position to light (computed in + * ray_optical_depth) sigma_s: Scattering density phase: Phase function of the scattering + * type (Rayleigh or Mie) L: Radiance coming from the light source segment_length: The + * length of the segment + * + * The code here is just that, with a bit of additional optimization to not store full + * spectra for the optical depth. + */ + r_spectrum[wl] += attenuation * reduce_add(phase_function * scattering_density) * + irradiance[wl] * segment_length; + } + } + + /* Advance along ray. 
*/ + P += segment; + } +} + +/* calculate texture array */ +void nishita_skymodel_precompute_texture(float *pixels, + int stride, + int start_y, + int end_y, + int width, + int height, + float sun_elevation, + float altitude, + float air_density, + float dust_density, + float ozone_density) +{ + /* calculate texture pixels */ + float spectrum[num_wavelengths]; + int half_width = width / 2; + float3 cam_pos = make_float3(0, 0, earth_radius + altitude); + float3 sun_dir = geographical_to_direction(sun_elevation, 0.0f); + + float latitude_step = M_PI_2_F / height; + float longitude_step = M_2PI_F / width; + + for (int y = start_y; y < end_y; y++) { + float latitude = latitude_step * y; + + float *pixel_row = pixels + (y * width) * stride; + for (int x = 0; x < half_width; x++) { + float longitude = longitude_step * x - M_PI_F; + + float3 dir = geographical_to_direction(latitude, longitude); + single_scattering(dir, sun_dir, cam_pos, air_density, dust_density, ozone_density, spectrum); + float3 xyz = spec_to_xyz(spectrum); + + pixel_row[x * stride + 0] = xyz.x; + pixel_row[x * stride + 1] = xyz.y; + pixel_row[x * stride + 2] = xyz.z; + int mirror_x = width - x - 1; + pixel_row[mirror_x * stride + 0] = xyz.x; + pixel_row[mirror_x * stride + 1] = xyz.y; + pixel_row[mirror_x * stride + 2] = xyz.z; + } + } +} + +/* Sun disc */ +static void sun_radiation(float3 cam_dir, + float altitude, + float air_density, + float dust_density, + float solid_angle, + float *r_spectrum) +{ + float3 cam_pos = make_float3(0, 0, earth_radius + altitude); + float3 optical_depth = ray_optical_depth(cam_pos, cam_dir); + + /* Compute final spectrum. */ + for (int i = 0; i < num_wavelengths; i++) { + /* Combine spectra and the optical depth into transmittance. 
*/ + float transmittance = rayleigh_coeff[i] * optical_depth.x * air_density + + 1.11f * mie_coeff * optical_depth.y * dust_density; + r_spectrum[i] = (irradiance[i] / solid_angle) * expf(-transmittance); + } +} + +void nishita_skymodel_precompute_sun(float sun_elevation, + float angular_diameter, + float altitude, + float air_density, + float dust_density, + float *pixel_bottom, + float *pixel_top) +{ + /* definitions */ + float half_angular = angular_diameter / 2.0f; + float solid_angle = M_2PI_F * (1.0f - cosf(half_angular)); + float spectrum[num_wavelengths]; + float bottom = sun_elevation - half_angular; + float top = sun_elevation + half_angular; + float elevation_bottom, elevation_top; + float3 pix_bottom, pix_top, sun_dir; + + /* compute 2 pixels for sun disc */ + elevation_bottom = (bottom > 0.0f) ? bottom : 0.0f; + elevation_top = (top > 0.0f) ? top : 0.0f; + sun_dir = geographical_to_direction(elevation_bottom, 0.0f); + sun_radiation(sun_dir, altitude, air_density, dust_density, solid_angle, spectrum); + pix_bottom = spec_to_xyz(spectrum); + sun_dir = geographical_to_direction(elevation_top, 0.0f); + sun_radiation(sun_dir, altitude, air_density, dust_density, solid_angle, spectrum); + pix_top = spec_to_xyz(spectrum); + + /* store pixels */ + pixel_bottom[0] = pix_bottom.x; + pixel_bottom[1] = pix_bottom.y; + pixel_bottom[2] = pix_bottom.z; + pixel_top[0] = pix_top.x; + pixel_top[1] = pix_top.y; + pixel_top[2] = pix_top.z; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index 61aa28c6815..4fb61392e92 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -20,100 +20,28 @@ #include "util/util_system.h" #include "util/util_time.h" -//#define THREADING_DEBUG_ENABLED - -#ifdef THREADING_DEBUG_ENABLED -# include <stdio.h> -# define THREADING_DEBUG(...) \ - do { \ - printf(__VA_ARGS__); \ - fflush(stdout); \ - } while (0) -#else -# define THREADING_DEBUG(...) 
-#endif - CCL_NAMESPACE_BEGIN /* Task Pool */ -TaskPool::TaskPool() +TaskPool::TaskPool() : start_time(time_dt()), num_tasks_handled(0) { - num_tasks_handled = 0; - num = 0; - do_cancel = false; } TaskPool::~TaskPool() { - stop(); + cancel(); } -void TaskPool::push(Task *task, bool front) +void TaskPool::push(TaskRunFunction &&task) { - TaskScheduler::Entry entry; - - entry.task = task; - entry.pool = this; - - TaskScheduler::push(entry, front); -} - -void TaskPool::push(const TaskRunFunction &run, bool front) -{ - push(new Task(run), front); + tbb_group.run(std::move(task)); + num_tasks_handled++; } void TaskPool::wait_work(Summary *stats) { - thread_scoped_lock num_lock(num_mutex); - - while (num != 0) { - num_lock.unlock(); - - thread_scoped_lock queue_lock(TaskScheduler::queue_mutex); - - /* find task from this pool. if we get a task from another pool, - * we can get into deadlock */ - TaskScheduler::Entry work_entry; - bool found_entry = false; - list<TaskScheduler::Entry>::iterator it; - - for (it = TaskScheduler::queue.begin(); it != TaskScheduler::queue.end(); it++) { - TaskScheduler::Entry &entry = *it; - - if (entry.pool == this) { - work_entry = entry; - found_entry = true; - TaskScheduler::queue.erase(it); - break; - } - } - - queue_lock.unlock(); - - /* if found task, do it, otherwise wait until other tasks are done */ - if (found_entry) { - /* run task */ - work_entry.task->run(0); - - /* delete task */ - delete work_entry.task; - - /* notify pool task was done */ - num_decrease(1); - } - - num_lock.lock(); - if (num == 0) - break; - - if (!found_entry) { - THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::wait_work !found_entry\n", num); - num_cond.wait(num_lock); - THREADING_DEBUG("num==%d, condition wait done in TaskPool::wait_work !found_entry\n", num); - } - } + tbb_group.wait(); if (stats != NULL) { stats->time_total = time_dt() - start_time; @@ -123,193 +51,21 @@ void TaskPool::wait_work(Summary *stats) void TaskPool::cancel() { - 
do_cancel = true; - - TaskScheduler::clear(this); - - { - thread_scoped_lock num_lock(num_mutex); - - while (num) { - THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::cancel\n", num); - num_cond.wait(num_lock); - THREADING_DEBUG("num==%d condition wait done in TaskPool::cancel\n", num); - } - } - - do_cancel = false; -} - -void TaskPool::stop() -{ - TaskScheduler::clear(this); - - assert(num == 0); + tbb_group.cancel(); + tbb_group.wait(); } bool TaskPool::canceled() { - return do_cancel; -} - -bool TaskPool::finished() -{ - thread_scoped_lock num_lock(num_mutex); - return num == 0; -} - -void TaskPool::num_decrease(int done) -{ - num_mutex.lock(); - num -= done; - - assert(num >= 0); - if (num == 0) { - THREADING_DEBUG("num==%d, notifying all in TaskPool::num_decrease\n", num); - num_cond.notify_all(); - } - - num_mutex.unlock(); -} - -void TaskPool::num_increase() -{ - thread_scoped_lock num_lock(num_mutex); - if (num_tasks_handled == 0) { - start_time = time_dt(); - } - num++; - num_tasks_handled++; - THREADING_DEBUG("num==%d, notifying all in TaskPool::num_increase\n", num); - num_cond.notify_all(); + return tbb_group.is_canceling(); } /* Task Scheduler */ thread_mutex TaskScheduler::mutex; int TaskScheduler::users = 0; -vector<thread *> TaskScheduler::threads; -bool TaskScheduler::do_exit = false; - -list<TaskScheduler::Entry> TaskScheduler::queue; -thread_mutex TaskScheduler::queue_mutex; -thread_condition_variable TaskScheduler::queue_cond; - -namespace { - -/* Get number of processors on each of the available nodes. The result is sized - * by the highest node index, and element corresponds to number of processors on - * that node. - * If node is not available, then the corresponding number of processors is - * zero. 
*/ -void get_per_node_num_processors(vector<int> *num_per_node_processors) -{ - const int num_nodes = system_cpu_num_numa_nodes(); - if (num_nodes == 0) { - LOG(ERROR) << "Zero available NUMA nodes, is not supposed to happen."; - return; - } - num_per_node_processors->resize(num_nodes); - for (int node = 0; node < num_nodes; ++node) { - if (!system_cpu_is_numa_node_available(node)) { - (*num_per_node_processors)[node] = 0; - continue; - } - (*num_per_node_processors)[node] = system_cpu_num_numa_node_processors(node); - } -} - -/* Calculate total number of processors on all available nodes. - * This is similar to system_cpu_thread_count(), but uses pre-calculated number - * of processors on each of the node, avoiding extra system calls and checks for - * the node availability. */ -int get_num_total_processors(const vector<int> &num_per_node_processors) -{ - int num_total_processors = 0; - foreach (int num_node_processors, num_per_node_processors) { - num_total_processors += num_node_processors; - } - return num_total_processors; -} - -/* Compute NUMA node for every thread to run on, for the best performance. */ -vector<int> distribute_threads_on_nodes(const int num_threads) -{ - /* Start with all threads unassigned to any specific NUMA node. */ - vector<int> thread_nodes(num_threads, -1); - const int num_active_group_processors = system_cpu_num_active_group_processors(); - VLOG(1) << "Detected " << num_active_group_processors << " processors " - << "in active group."; - if (num_active_group_processors >= num_threads) { - /* If the current thread is set up in a way that its affinity allows to - * use at least requested number of threads we do not explicitly set - * affinity to the worker threads. - * This way we allow users to manually edit affinity of the parent - * thread, and here we follow that affinity. This way it's possible to - * have two Cycles/Blender instances running manually set to a different - * dies on a CPU. 
*/ - VLOG(1) << "Not setting thread group affinity."; - return thread_nodes; - } - vector<int> num_per_node_processors; - get_per_node_num_processors(&num_per_node_processors); - if (num_per_node_processors.size() == 0) { - /* Error was already reported, here we can't do anything, so we simply - * leave default affinity to all the worker threads. */ - return thread_nodes; - } - const int num_nodes = num_per_node_processors.size(); - int thread_index = 0; - /* First pass: fill in all the nodes to their maximum. - * - * If there is less threads than the overall nodes capacity, some of the - * nodes or parts of them will idle. - * - * TODO(sergey): Consider picking up fastest nodes if number of threads - * fits on them. For example, on Threadripper2 we might consider using nodes - * 0 and 2 if user requested 32 render threads. */ - const int num_total_node_processors = get_num_total_processors(num_per_node_processors); - int current_node_index = 0; - while (thread_index < num_total_node_processors && thread_index < num_threads) { - const int num_node_processors = num_per_node_processors[current_node_index]; - for (int processor_index = 0; processor_index < num_node_processors; ++processor_index) { - VLOG(1) << "Scheduling thread " << thread_index << " to node " << current_node_index << "."; - thread_nodes[thread_index] = current_node_index; - ++thread_index; - if (thread_index == num_threads) { - /* All threads are scheduled on their nodes. */ - return thread_nodes; - } - } - ++current_node_index; - } - /* Second pass: keep scheduling threads to each node one by one, - * uniformly filling them in. - * This is where things becomes tricky to predict for the maximum - * performance: on the one hand this avoids too much threading overhead on - * few nodes, but for the final performance having all the overhead on one - * node might be better idea (since other nodes will have better chance of - * rendering faster). 
- * But more tricky is that nodes might have difference capacity, so we might - * want to do some weighted scheduling. For example, if node 0 has 16 - * processors and node 1 has 32 processors, we'd better schedule 1 extra - * thread on node 0 and 2 extra threads on node 1. */ - current_node_index = 0; - while (thread_index < num_threads) { - /* Skip unavailable nodes. */ - /* TODO(sergey): Add sanity check against deadlock. */ - while (num_per_node_processors[current_node_index] == 0) { - current_node_index = (current_node_index + 1) % num_nodes; - } - VLOG(1) << "Scheduling thread " << thread_index << " to node " << current_node_index << "."; - ++thread_index; - current_node_index = (current_node_index + 1) % num_nodes; - } - - return thread_nodes; -} - -} // namespace +int TaskScheduler::active_num_threads = 0; +tbb::global_control *TaskScheduler::global_control = nullptr; void TaskScheduler::init(int num_threads) { @@ -320,22 +76,15 @@ void TaskScheduler::init(int num_threads) if (users != 1) { return; } - do_exit = false; - const bool use_auto_threads = (num_threads == 0); - if (use_auto_threads) { + if (num_threads > 0) { /* Automatic number of threads. */ - num_threads = system_cpu_thread_count(); + VLOG(1) << "Overriding number of TBB threads to " << num_threads << "."; + global_control = new tbb::global_control(tbb::global_control::max_allowed_parallelism, + num_threads); + active_num_threads = num_threads; } - VLOG(1) << "Creating pool of " << num_threads << " threads."; - - /* Compute distribution on NUMA nodes. */ - vector<int> thread_nodes = distribute_threads_on_nodes(num_threads); - - /* Launch threads that will be waiting for work. 
*/ - threads.resize(num_threads); - for (int thread_index = 0; thread_index < num_threads; ++thread_index) { - threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run, thread_index + 1), - thread_nodes[thread_index]); + else { + active_num_threads = system_cpu_thread_count(); } } @@ -344,105 +93,20 @@ void TaskScheduler::exit() thread_scoped_lock lock(mutex); users--; if (users == 0) { - VLOG(1) << "De-initializing thread pool of task scheduler."; - /* stop all waiting threads */ - TaskScheduler::queue_mutex.lock(); - do_exit = true; - TaskScheduler::queue_cond.notify_all(); - TaskScheduler::queue_mutex.unlock(); - - /* delete threads */ - foreach (thread *t, threads) { - t->join(); - delete t; - } - threads.clear(); + delete global_control; + global_control = nullptr; + active_num_threads = 0; } } void TaskScheduler::free_memory() { assert(users == 0); - threads.free_memory(); -} - -bool TaskScheduler::thread_wait_pop(Entry &entry) -{ - thread_scoped_lock queue_lock(queue_mutex); - - while (queue.empty() && !do_exit) - queue_cond.wait(queue_lock); - - if (queue.empty()) { - assert(do_exit); - return false; - } - - entry = queue.front(); - queue.pop_front(); - - return true; } -void TaskScheduler::thread_run(int thread_id) +int TaskScheduler::num_threads() { - Entry entry; - - /* todo: test affinity/denormal mask */ - - /* keep popping off tasks */ - while (thread_wait_pop(entry)) { - /* run task */ - entry.task->run(thread_id); - - /* delete task */ - delete entry.task; - - /* notify pool task was done */ - entry.pool->num_decrease(1); - } -} - -void TaskScheduler::push(Entry &entry, bool front) -{ - entry.pool->num_increase(); - - /* add entry to queue */ - TaskScheduler::queue_mutex.lock(); - if (front) - TaskScheduler::queue.push_front(entry); - else - TaskScheduler::queue.push_back(entry); - - TaskScheduler::queue_cond.notify_one(); - TaskScheduler::queue_mutex.unlock(); -} - -void TaskScheduler::clear(TaskPool *pool) -{ - 
thread_scoped_lock queue_lock(TaskScheduler::queue_mutex); - - /* erase all tasks from this pool from the queue */ - list<Entry>::iterator it = queue.begin(); - int done = 0; - - while (it != queue.end()) { - Entry &entry = *it; - - if (entry.pool == pool) { - done++; - delete entry.task; - - it = queue.erase(it); - } - else - it++; - } - - queue_lock.unlock(); - - /* notify done */ - pool->num_decrease(done); + return active_num_threads; } /* Dedicated Task Pool */ @@ -458,31 +122,30 @@ DedicatedTaskPool::DedicatedTaskPool() DedicatedTaskPool::~DedicatedTaskPool() { - stop(); + wait(); + + do_exit = true; + queue_cond.notify_all(); + worker_thread->join(); delete worker_thread; } -void DedicatedTaskPool::push(Task *task, bool front) +void DedicatedTaskPool::push(TaskRunFunction &&task, bool front) { num_increase(); /* add task to queue */ queue_mutex.lock(); if (front) - queue.push_front(task); + queue.emplace_front(std::move(task)); else - queue.push_back(task); + queue.emplace_back(std::move(task)); queue_cond.notify_one(); queue_mutex.unlock(); } -void DedicatedTaskPool::push(const TaskRunFunction &run, bool front) -{ - push(new Task(run), front); -} - void DedicatedTaskPool::wait() { thread_scoped_lock num_lock(num_mutex); @@ -501,18 +164,6 @@ void DedicatedTaskPool::cancel() do_cancel = false; } -void DedicatedTaskPool::stop() -{ - clear(); - - do_exit = true; - queue_cond.notify_all(); - - wait(); - - assert(num == 0); -} - bool DedicatedTaskPool::canceled() { return do_cancel; @@ -535,7 +186,7 @@ void DedicatedTaskPool::num_increase() num_cond.notify_all(); } -bool DedicatedTaskPool::thread_wait_pop(Task *&task) +bool DedicatedTaskPool::thread_wait_pop(TaskRunFunction &task) { thread_scoped_lock queue_lock(queue_mutex); @@ -555,15 +206,15 @@ bool DedicatedTaskPool::thread_wait_pop(Task *&task) void DedicatedTaskPool::thread_run() { - Task *task; + TaskRunFunction task; /* keep popping off tasks */ while (thread_wait_pop(task)) { /* run task */ - 
task->run(0); + task(); /* delete task */ - delete task; + task = nullptr; /* notify task was done */ num_decrease(1); @@ -575,15 +226,8 @@ void DedicatedTaskPool::clear() thread_scoped_lock queue_lock(queue_mutex); /* erase all tasks from the queue */ - list<Task *>::iterator it = queue.begin(); - int done = 0; - - while (it != queue.end()) { - done++; - delete *it; - - it = queue.erase(it); - } + int done = queue.size(); + queue.clear(); queue_lock.unlock(); diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h index fd30a33d8ef..a56ca62f62c 100644 --- a/intern/cycles/util/util_task.h +++ b/intern/cycles/util/util_task.h @@ -19,48 +19,16 @@ #include "util/util_list.h" #include "util/util_string.h" +#include "util/util_tbb.h" #include "util/util_thread.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN -class Task; class TaskPool; class TaskScheduler; -/* Notes on Thread ID - * - * Thread ID argument reports the 0-based ID of a working thread from which - * the run() callback is being invoked. Thread ID of 0 denotes the thread from - * which wait_work() was called. - * - * DO NOT use this ID to control execution flaw, use it only for things like - * emulating TLS which does not affect on scheduling. Don't use this ID to make - * any decisions. - * - * It is to be noted here that dedicated task pool will always report thread ID - * of 0. - */ - -typedef function<void(int thread_id)> TaskRunFunction; - -/* Task - * - * Base class for tasks to be executed in threads. */ - -class Task { - public: - Task(){}; - explicit Task(const TaskRunFunction &run_) : run(run_) - { - } - - virtual ~Task() - { - } - - TaskRunFunction run; -}; +typedef function<void(void)> TaskRunFunction; /* Task Pool * @@ -68,8 +36,7 @@ class Task { * pool, we can wait for all tasks to be done, or cancel them before they are * done. 
* - * The run callback that actually executes the task may be created like this: - * function_bind(&MyClass::task_execute, this, _1, _2) */ + * TaskRunFunction may be created with std::bind or lambda expressions. */ class TaskPool { public: @@ -89,27 +56,15 @@ class TaskPool { TaskPool(); ~TaskPool(); - void push(Task *task, bool front = false); - void push(const TaskRunFunction &run, bool front = false); + void push(TaskRunFunction &&task); void wait_work(Summary *stats = NULL); /* work and wait until all tasks are done */ - void cancel(); /* cancel all tasks, keep worker threads running */ - void stop(); /* stop all worker threads */ - bool finished(); /* check if all work has been completed */ + void cancel(); /* cancel all tasks and wait until they are no longer executing */ bool canceled(); /* for worker threads, test if canceled */ protected: - friend class TaskScheduler; - - void num_decrease(int done); - void num_increase(); - - thread_mutex num_mutex; - thread_condition_variable num_cond; - - int num; - bool do_cancel; + tbb::task_group tbb_group; /* ** Statistics ** */ @@ -131,40 +86,19 @@ class TaskScheduler { static void exit(); static void free_memory(); - /* number of threads that can work on task */ - static int num_threads() - { - return threads.size(); - } - - /* test if any session is using the scheduler */ - static bool active() - { - return users != 0; - } + /* Approximate number of threads that will work on task, which may be lower + * or higher than the actual number of threads. Use as little as possible and + * leave splitting up tasks to the scheduler.. 
*/ + static int num_threads(); protected: - friend class TaskPool; - - struct Entry { - Task *task; - TaskPool *pool; - }; - static thread_mutex mutex; static int users; - static vector<thread *> threads; - static bool do_exit; + static int active_num_threads; - static list<Entry> queue; - static thread_mutex queue_mutex; - static thread_condition_variable queue_cond; - - static void thread_run(int thread_id); - static bool thread_wait_pop(Entry &entry); - - static void push(Entry &entry, bool front); - static void clear(TaskPool *pool); +#ifdef WITH_TBB_GLOBAL_CONTROL + static tbb::global_control *global_control; +#endif }; /* Dedicated Task Pool @@ -179,12 +113,10 @@ class DedicatedTaskPool { DedicatedTaskPool(); ~DedicatedTaskPool(); - void push(Task *task, bool front = false); - void push(const TaskRunFunction &run, bool front = false); + void push(TaskRunFunction &&run, bool front = false); void wait(); /* wait until all tasks are done */ void cancel(); /* cancel all tasks, keep worker thread running */ - void stop(); /* stop worker thread */ bool canceled(); /* for worker thread, test if canceled */ @@ -193,14 +125,14 @@ class DedicatedTaskPool { void num_increase(); void thread_run(); - bool thread_wait_pop(Task *&entry); + bool thread_wait_pop(TaskRunFunction &task); void clear(); thread_mutex num_mutex; thread_condition_variable num_cond; - list<Task *> queue; + list<TaskRunFunction> queue; thread_mutex queue_mutex; thread_condition_variable queue_cond; diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h new file mode 100644 index 00000000000..301cb80c5b0 --- /dev/null +++ b/intern/cycles/util/util_tbb.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2020 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TBB_H__ +#define __UTIL_TBB_H__ + +/* TBB includes <windows.h>, do it ourselves first so we are sure + * WIN32_LEAN_AND_MEAN and similar are defined beforehand. */ +#include "util_windows.h" + +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 +#include <tbb/tbb.h> + +#if TBB_INTERFACE_VERSION_MAJOR >= 10 +# define WITH_TBB_GLOBAL_CONTROL +#endif + +CCL_NAMESPACE_BEGIN + +using tbb::blocked_range; +using tbb::enumerable_thread_specific; +using tbb::parallel_for; + +CCL_NAMESPACE_END + +#endif /* __UTIL_TBB_H__ */ diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h index bb2c99cc6d7..8bce5ff85aa 100644 --- a/intern/cycles/util/util_version.h +++ b/intern/cycles/util/util_version.h @@ -22,7 +22,7 @@ CCL_NAMESPACE_BEGIN #define CYCLES_VERSION_MAJOR 1 -#define CYCLES_VERSION_MINOR 12 +#define CYCLES_VERSION_MINOR 13 #define CYCLES_VERSION_PATCH 0 #define CYCLES_MAKE_VERSION_STRING2(a, b, c) #a "." #b "." #c |