diff options
Diffstat (limited to 'intern/cycles')
155 files changed, 5143 insertions, 4960 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index c53a9f91cc0..5844c2480d6 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -41,61 +41,65 @@ elseif(WIN32 AND MSVC) set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") endif() + # Unlike GCC/clang we still use fast math, because there is no fine + # grained control and the speedup we get here is too big to ignore. + set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + # there is no /arch:SSE3, but intrinsics are available anyway if(CMAKE_CL_64) - set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") else() - set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + 
set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox") set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox") - - set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") -elseif(CMAKE_COMPILER_IS_GNUCC) +elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) check_cxx_compiler_flag(-msse CXX_HAS_SSE) check_cxx_compiler_flag(-mavx CXX_HAS_AVX) check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) - set(CYCLES_KERNEL_FLAGS "-ffast-math") - if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") - set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") - set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") - endif() - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") - endif() - if(CXX_HAS_AVX2) - set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse") + + # Assume no signal trapping for better code generation. + set(CYCLES_KERNEL_FLAGS "-fno-trapping-math") + # Avoid overhead of setting errno for NaNs. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-math-errno") + # Let compiler optimize 0.0 - x without worrying about signed zeros. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signed-zeros") + + if(CMAKE_COMPILER_IS_GNUCC) + # Assume no signal trapping for better code generation. 
+ set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signaling-nans") + # Assume a fixed rounding mode for better constant folding. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-rounding-math") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") -elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - check_cxx_compiler_flag(-msse CXX_HAS_SSE) - check_cxx_compiler_flag(-mavx CXX_HAS_AVX) - check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) - set(CYCLES_KERNEL_FLAGS "-ffast-math") + if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2") - set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") - set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") - endif() - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") - endif() - if(CXX_HAS_AVX2) - set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") + if(CMAKE_COMPILER_IS_GNUCC) + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -mfpmath=sse") + endif() + + set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2") + set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3") + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1") + if(CXX_HAS_AVX) + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx") + endif() + if(CXX_HAS_AVX2) + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") + endif() endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() if(CXX_HAS_SSE) diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index 82c4ffc6e50..17efb00abdb 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -32,14 
+32,11 @@ class AddPresetIntegrator(AddPresetBase, Operator): preset_values = [ "cycles.max_bounces", - "cycles.min_bounces", "cycles.diffuse_bounces", "cycles.glossy_bounces", "cycles.transmission_bounces", "cycles.volume_bounces", - "cycles.transparent_min_bounces", "cycles.transparent_max_bounces", - "cycles.use_transparent_shadows", "cycles.caustics_reflective", "cycles.caustics_refractive", "cycles.blur_glossy" diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 68474529ed3..7b16ef1d543 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -205,13 +205,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): name="AA Samples", description="Number of antialiasing samples to render for each pixel", min=1, max=2097151, - default=4, + default=128, ) cls.preview_aa_samples = IntProperty( name="AA Samples", description="Number of antialiasing samples to render in the viewport, unlimited if 0", min=0, max=2097151, - default=4, + default=32, ) cls.diffuse_samples = IntProperty( name="Diffuse Samples", @@ -308,17 +308,9 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): description="Adaptively blur glossy shaders after blurry bounces, " "to reduce noise at the cost of accuracy", min=0.0, max=10.0, - default=0.0, + default=1.0, ) - cls.min_bounces = IntProperty( - name="Min Bounces", - description="Minimum number of bounces, setting this lower " - "than the maximum enables probabilistic path " - "termination (faster but noisier)", - min=0, max=1024, - default=3, - ) cls.max_bounces = IntProperty( name="Max Bounces", description="Total maximum number of bounces", @@ -351,26 +343,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0, ) - cls.transparent_min_bounces = IntProperty( - name="Transparent Min Bounces", - description="Minimum number of transparent bounces, setting " - "this lower than the maximum enables " - "probabilistic path 
termination (faster but " - "noisier)", - min=0, max=1024, - default=8, - ) cls.transparent_max_bounces = IntProperty( name="Transparent Max Bounces", description="Maximum number of transparent bounces", min=0, max=1024, default=8, ) - cls.use_transparent_shadows = BoolProperty( - name="Transparent Shadows", - description="Use transparency of surfaces for rendering shadows", - default=True, - ) cls.volume_step_size = FloatProperty( name="Step Size", @@ -475,7 +453,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): "higher values will be scaled down to avoid too " "much noise and slow convergence at the cost of accuracy", min=0.0, max=1e8, - default=0.0, + default=10.0, ) cls.debug_tile_size = IntProperty( diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 49beebe5ab4..7ab47455c49 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -139,7 +139,7 @@ def draw_samples_info(layout, context): (ao * aa, ml * aa, sss * aa, vol * aa)) -class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): bl_label = "Sampling" bl_options = {'DEFAULT_CLOSED'} @@ -214,7 +214,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): draw_samples_info(layout, context) -class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_geometry(CyclesButtonsPanel, Panel): bl_label = "Geometry" bl_options = {'DEFAULT_CLOSED'} @@ -270,7 +270,7 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel): row.prop(ccscene, "maximum_width", text="Max Extension") -class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_light_paths(CyclesButtonsPanel, Panel): bl_label = "Light Paths" bl_options = {'DEFAULT_CLOSED'} @@ -292,8 +292,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.label("Transparency:") sub.prop(cscene, "transparent_max_bounces", 
text="Max") - sub.prop(cscene, "transparent_min_bounces", text="Min") - sub.prop(cscene, "use_transparent_shadows", text="Shadows") col.separator() @@ -306,7 +304,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.label(text="Bounces:") sub.prop(cscene, "max_bounces", text="Max") - sub.prop(cscene, "min_bounces", text="Min") sub = col.column(align=True) sub.prop(cscene, "diffuse_bounces", text="Diffuse") @@ -315,7 +312,7 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub.prop(cscene, "volume_bounces", text="Volume") -class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_motion_blur(CyclesButtonsPanel, Panel): bl_label = "Motion Blur" bl_options = {'DEFAULT_CLOSED'} @@ -356,7 +353,7 @@ class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel): row.prop(cscene, "rolling_shutter_duration") -class CyclesRender_PT_film(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_film(CyclesButtonsPanel, Panel): bl_label = "Film" def draw(self, context): @@ -378,7 +375,7 @@ class CyclesRender_PT_film(CyclesButtonsPanel, Panel): sub.prop(cscene, "filter_width", text="Width") -class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel): bl_label = "Performance" bl_options = {'DEFAULT_CLOSED'} @@ -399,6 +396,8 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): sub.enabled = rd.threads_mode == 'FIXED' sub.prop(rd, "threads") + col.separator() + sub = col.column(align=True) sub.label(text="Tiles:") sub.prop(cscene, "tile_order", text="") @@ -408,19 +407,10 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): sub.prop(cscene, "use_progressive_refine") - subsub = sub.column(align=True) - subsub.prop(rd, "use_save_buffers") - - col = split.column(align=True) - - col.label(text="Viewport:") - col.prop(cscene, "debug_bvh_type", text="") - col.separator() - col.prop(cscene, "preview_start_resolution") 
- - col.separator() + col = split.column() col.label(text="Final Render:") + col.prop(rd, "use_save_buffers") col.prop(rd, "use_persistent_data", text="Persistent Images") col.separator() @@ -433,8 +423,14 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): row.active = not cscene.debug_use_spatial_splits row.prop(cscene, "debug_bvh_time_steps") + col = layout.column() + col.label(text="Viewport Resolution:") + split = col.split() + split.prop(rd, "preview_pixel_size", text="") + split.prop(cscene, "preview_start_resolution") + -class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_layer_options(CyclesButtonsPanel, Panel): bl_label = "Layer" bl_context = "render_layer" @@ -470,7 +466,7 @@ class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel): col.prop(rl, "use_strand", "Use Hair") -class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel): bl_label = "Passes" bl_context = "render_layer" bl_options = {'DEFAULT_CLOSED'} @@ -544,7 +540,7 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(crl, "pass_debug_ray_bounces") -class CyclesRender_PT_views(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_views(CyclesButtonsPanel, Panel): bl_label = "Views" bl_context = "render_layer" bl_options = {'DEFAULT_CLOSED'} @@ -587,7 +583,7 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel): row.prop(rv, "camera_suffix", text="") -class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): bl_label = "Denoising" bl_context = "render_layer" bl_options = {'DEFAULT_CLOSED'} @@ -652,7 +648,7 @@ class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel): sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True) -class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): +class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post 
Processing" bl_options = {'DEFAULT_CLOSED'} @@ -671,7 +667,7 @@ class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): col.prop(rd, "dither_intensity", text="Dither", slider=True) -class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel): +class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel): bl_label = "Depth of Field" bl_context = "data" @@ -722,7 +718,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel): sub.prop(ccam, "aperture_ratio", text="Ratio") -class Cycles_PT_context_material(CyclesButtonsPanel, Panel): +class CYCLES_PT_context_material(CyclesButtonsPanel, Panel): bl_label = "" bl_context = "material" bl_options = {'HIDE_HEADER'} @@ -782,7 +778,7 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel): split.separator() -class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): +class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel): bl_label = "Motion Blur" bl_context = "object" bl_options = {'DEFAULT_CLOSED'} @@ -830,7 +826,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): sub.prop(cob, "motion_steps", text="Steps") -class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel): +class CYCLES_OBJECT_PT_cycles_settings(CyclesButtonsPanel, Panel): bl_label = "Cycles Settings" bl_context = "object" bl_options = {'DEFAULT_CLOSED'} @@ -939,7 +935,7 @@ def panel_node_draw(layout, id_data, output_type, input_name): return True -class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" bl_context = "data" bl_options = {'DEFAULT_CLOSED'} @@ -955,7 +951,7 @@ class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel): self.layout.template_preview(context.lamp) -class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_lamp(CyclesButtonsPanel, Panel): bl_label = "Lamp" bl_context = "data" @@ -1009,7 +1005,7 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel): layout.label(text="Not supported, interpreted as sun lamp") -class 
CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_nodes(CyclesButtonsPanel, Panel): bl_label = "Nodes" bl_context = "data" @@ -1027,7 +1023,7 @@ class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel): layout.prop(lamp, "color") -class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_spot(CyclesButtonsPanel, Panel): bl_label = "Spot Shape" bl_context = "data" @@ -1052,7 +1048,7 @@ class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel): col.prop(lamp, "show_cone") -class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1065,7 +1061,7 @@ class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel): self.layout.template_preview(context.world) -class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_surface(CyclesButtonsPanel, Panel): bl_label = "Surface" bl_context = "world" @@ -1082,7 +1078,7 @@ class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel): layout.prop(world, "horizon_color", text="Color") -class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel): bl_label = "Volume" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1099,7 +1095,7 @@ class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel): panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume') -class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel): bl_label = "Ambient Occlusion" bl_context = "world" @@ -1124,7 +1120,7 @@ class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel): row.prop(light, "distance", text="Distance") -class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel): bl_label = "Mist Pass" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1151,7 +1147,7 @@ class 
CyclesWorld_PT_mist(CyclesButtonsPanel, Panel): layout.prop(world.mist_settings, "falloff") -class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel): bl_label = "Ray Visibility" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1175,7 +1171,7 @@ class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel): flow.prop(visibility, "scatter") -class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel): bl_label = "Settings" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1216,7 +1212,7 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel): col.prop(cworld, "homogeneous_volume", text="Homogeneous") -class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" bl_context = "material" bl_options = {'DEFAULT_CLOSED'} @@ -1229,7 +1225,7 @@ class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel): self.layout.template_preview(context.material) -class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_surface(CyclesButtonsPanel, Panel): bl_label = "Surface" bl_context = "material" @@ -1245,7 +1241,7 @@ class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel): layout.prop(mat, "diffuse_color") -class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_volume(CyclesButtonsPanel, Panel): bl_label = "Volume" bl_context = "material" bl_options = {'DEFAULT_CLOSED'} @@ -1264,7 +1260,7 @@ class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel): panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Volume') -class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_displacement(CyclesButtonsPanel, Panel): bl_label = "Displacement" bl_context = "material" @@ -1280,7 +1276,7 @@ class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel): panel_node_draw(layout, 
mat, 'OUTPUT_MATERIAL', 'Displacement') -class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_settings(CyclesButtonsPanel, Panel): bl_label = "Settings" bl_context = "material" bl_options = {'DEFAULT_CLOSED'} @@ -1335,7 +1331,7 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel): col.prop(mat, "pass_index") -class CyclesTexture_PT_context(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_context(CyclesButtonsPanel, Panel): bl_label = "" bl_context = "texture" bl_options = {'HIDE_HEADER'} @@ -1376,7 +1372,7 @@ class CyclesTexture_PT_context(CyclesButtonsPanel, Panel): split.prop(tex, "type", text="") -class CyclesTexture_PT_node(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_node(CyclesButtonsPanel, Panel): bl_label = "Node" bl_context = "texture" @@ -1393,7 +1389,7 @@ class CyclesTexture_PT_node(CyclesButtonsPanel, Panel): layout.template_node_view(ntree, node, None) -class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_mapping(CyclesButtonsPanel, Panel): bl_label = "Mapping" bl_context = "texture" @@ -1426,7 +1422,7 @@ class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel): row.prop(mapping, "mapping_z", text="") -class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_colors(CyclesButtonsPanel, Panel): bl_label = "Color" bl_context = "texture" bl_options = {'DEFAULT_CLOSED'} @@ -1465,7 +1461,7 @@ class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel): layout.template_color_ramp(mapping, "color_ramp", expand=True) -class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel): +class CYCLES_PARTICLE_PT_textures(CyclesButtonsPanel, Panel): bl_label = "Textures" bl_context = "particle" bl_options = {'DEFAULT_CLOSED'} @@ -1496,7 +1492,7 @@ class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel): layout.template_ID(slot, "texture", new="texture.new") -class CyclesRender_PT_bake(CyclesButtonsPanel, Panel): +class 
CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel): bl_label = "Bake" bl_context = "render" bl_options = {'DEFAULT_CLOSED'} @@ -1569,7 +1565,7 @@ class CyclesRender_PT_bake(CyclesButtonsPanel, Panel): sub.prop(cbk, "cage_extrusion", text="Ray Distance") -class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): bl_label = "Debug" bl_context = "render" bl_options = {'DEFAULT_CLOSED'} @@ -1597,11 +1593,15 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): col.prop(cscene, "debug_use_qbvh") col.prop(cscene, "debug_use_cpu_split_kernel") + col.separator() + col = layout.column() col.label('CUDA Flags:') col.prop(cscene, "debug_use_cuda_adaptive_compile") col.prop(cscene, "debug_use_cuda_split_kernel") + col.separator() + col = layout.column() col.label('OpenCL Flags:') col.prop(cscene, "debug_opencl_kernel_type", text="Kernel") @@ -1610,8 +1610,13 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): col.prop(cscene, "debug_use_opencl_debug", text="Debug") col.prop(cscene, "debug_opencl_mem_limit") + col.separator() + + col = layout.column() + col.prop(cscene, "debug_bvh_type") + -class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): +class CYCLES_PARTICLE_PT_curve_settings(CyclesButtonsPanel, Panel): bl_label = "Cycles Hair Settings" bl_context = "particle" @@ -1642,7 +1647,7 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): row.prop(cpsys, "use_closetip", text="Close tip") -class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel): +class CYCLES_SCENE_PT_simplify(CyclesButtonsPanel, Panel): bl_label = "Simplify" bl_context = "scene" COMPAT_ENGINES = {'CYCLES'} @@ -1797,47 +1802,47 @@ def get_panels(): classes = ( CYCLES_MT_sampling_presets, CYCLES_MT_integrator_presets, - CyclesRender_PT_sampling, - CyclesRender_PT_geometry, - CyclesRender_PT_light_paths, - CyclesRender_PT_motion_blur, - CyclesRender_PT_film, - CyclesRender_PT_performance, - CyclesRender_PT_layer_options, 
- CyclesRender_PT_layer_passes, - CyclesRender_PT_views, - CyclesRender_PT_denoising, - Cycles_PT_post_processing, - CyclesCamera_PT_dof, - Cycles_PT_context_material, - CyclesObject_PT_motion_blur, - CyclesObject_PT_cycles_settings, + CYCLES_RENDER_PT_sampling, + CYCLES_RENDER_PT_geometry, + CYCLES_RENDER_PT_light_paths, + CYCLES_RENDER_PT_motion_blur, + CYCLES_RENDER_PT_film, + CYCLES_RENDER_PT_performance, + CYCLES_RENDER_PT_layer_options, + CYCLES_RENDER_PT_layer_passes, + CYCLES_RENDER_PT_views, + CYCLES_RENDER_PT_denoising, + CYCLES_PT_post_processing, + CYCLES_CAMERA_PT_dof, + CYCLES_PT_context_material, + CYCLES_OBJECT_PT_motion_blur, + CYCLES_OBJECT_PT_cycles_settings, CYCLES_OT_use_shading_nodes, - CyclesLamp_PT_preview, - CyclesLamp_PT_lamp, - CyclesLamp_PT_nodes, - CyclesLamp_PT_spot, - CyclesWorld_PT_preview, - CyclesWorld_PT_surface, - CyclesWorld_PT_volume, - CyclesWorld_PT_ambient_occlusion, - CyclesWorld_PT_mist, - CyclesWorld_PT_ray_visibility, - CyclesWorld_PT_settings, - CyclesMaterial_PT_preview, - CyclesMaterial_PT_surface, - CyclesMaterial_PT_volume, - CyclesMaterial_PT_displacement, - CyclesMaterial_PT_settings, - CyclesTexture_PT_context, - CyclesTexture_PT_node, - CyclesTexture_PT_mapping, - CyclesTexture_PT_colors, - CyclesParticle_PT_textures, - CyclesRender_PT_bake, - CyclesRender_PT_debug, - CyclesParticle_PT_CurveSettings, - CyclesScene_PT_simplify, + CYCLES_LAMP_PT_preview, + CYCLES_LAMP_PT_lamp, + CYCLES_LAMP_PT_nodes, + CYCLES_LAMP_PT_spot, + CYCLES_WORLD_PT_preview, + CYCLES_WORLD_PT_surface, + CYCLES_WORLD_PT_volume, + CYCLES_WORLD_PT_ambient_occlusion, + CYCLES_WORLD_PT_mist, + CYCLES_WORLD_PT_ray_visibility, + CYCLES_WORLD_PT_settings, + CYCLES_MATERIAL_PT_preview, + CYCLES_MATERIAL_PT_surface, + CYCLES_MATERIAL_PT_volume, + CYCLES_MATERIAL_PT_displacement, + CYCLES_MATERIAL_PT_settings, + CYCLES_TEXTURE_PT_context, + CYCLES_TEXTURE_PT_node, + CYCLES_TEXTURE_PT_mapping, + CYCLES_TEXTURE_PT_colors, + CYCLES_PARTICLE_PT_textures, 
+ CYCLES_RENDER_PT_bake, + CYCLES_RENDER_PT_debug, + CYCLES_PARTICLE_PT_curve_settings, + CYCLES_SCENE_PT_simplify, ) diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py index b2a745500a1..efd794461d6 100644 --- a/intern/cycles/blender/addon/version_update.py +++ b/intern/cycles/blender/addon/version_update.py @@ -302,3 +302,16 @@ def do_versions(self): cscene = scene.cycles if not cscene.is_property_set("light_sampling_threshold"): cscene.light_sampling_threshold = 0.0 + + if bpy.data.version <= (2, 79, 0): + for scene in bpy.data.scenes: + cscene = scene.cycles + # Default changes + if not cscene.is_property_set("aa_samples"): + cscene.aa_samples = 4 + if not cscene.is_property_set("preview_aa_samples"): + cscene.preview_aa_samples = 4 + if not cscene.is_property_set("blur_glossy"): + cscene.blur_glossy = 0.0 + if not cscene.is_property_set("sample_clamp_indirect"): + cscene.sample_clamp_indirect = 0.0 diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index 40d6b25f2b7..b29711d30d3 100644 --- a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -544,7 +544,11 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render, if(tfm != cam->matrix) { VLOG(1) << "Camera " << b_ob.name() << " motion detected."; - if(motion_time == -1.0f) { + if(motion_time == 0.0f) { + /* When motion blur is not centered in frame, cam->matrix gets reset. 
*/ + cam->matrix = tfm; + } + else if(motion_time == -1.0f) { cam->motion.pre = tfm; cam->use_motion = true; } @@ -573,7 +577,10 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render, float fov = 2.0f * atanf((0.5f * sensor_size) / bcam.lens / aspectratio); if(fov != cam->fov) { VLOG(1) << "Camera " << b_ob.name() << " FOV change detected."; - if(motion_time == -1.0f) { + if(motion_time == 0.0f) { + cam->fov = fov; + } + else if(motion_time == -1.0f) { cam->fov_pre = fov; cam->use_perspective_motion = true; } diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 3ebe2d8cf34..4091c44d379 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -50,8 +50,7 @@ enum { * Two triangles has vertex indices in the original Blender-side face. * If face is already a quad tri_b will not be initialized. */ -inline void face_split_tri_indices(const int num_verts, - const int face_flag, +inline void face_split_tri_indices(const int face_flag, int tri_a[3], int tri_b[3]) { @@ -59,36 +58,37 @@ inline void face_split_tri_indices(const int num_verts, tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 3; - if(num_verts == 4) { - tri_b[0] = 2; - tri_b[1] = 3; - tri_b[2] = 1; - } + + tri_b[0] = 2; + tri_b[1] = 3; + tri_b[2] = 1; } - else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ { + else { + /* Quad with FACE_FLAG_DIVIDE_13 or single triangle. 
*/ tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 2; - if(num_verts == 4) { - tri_b[0] = 0; - tri_b[1] = 2; - tri_b[2] = 3; - } + + tri_b[0] = 0; + tri_b[1] = 2; + tri_b[2] = 3; } } /* Tangent Space */ struct MikkUserData { - MikkUserData(const BL::Mesh& mesh_, - BL::MeshTextureFaceLayer *layer_, - int num_faces_) - : mesh(mesh_), layer(layer_), num_faces(num_faces_) + MikkUserData(const BL::Mesh& b_mesh, + BL::MeshTextureFaceLayer *layer, + int num_faces) + : b_mesh(b_mesh), + layer(layer), + num_faces(num_faces) { tangent.resize(num_faces*4); } - BL::Mesh mesh; + BL::Mesh b_mesh; BL::MeshTextureFaceLayer *layer; int num_faces; vector<float4> tangent; @@ -103,7 +103,7 @@ static int mikk_get_num_faces(const SMikkTSpaceContext *context) static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, const int face_num) { MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTessFace f = userdata->mesh.tessfaces[face_num]; + BL::MeshTessFace f = userdata->b_mesh.tessfaces[face_num]; int4 vi = get_int4(f.vertices_raw()); return (vi[3] == 0)? 
3: 4; @@ -112,9 +112,9 @@ static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, const i static void mikk_get_position(const SMikkTSpaceContext *context, float P[3], const int face_num, const int vert_num) { MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTessFace f = userdata->mesh.tessfaces[face_num]; + BL::MeshTessFace f = userdata->b_mesh.tessfaces[face_num]; int4 vi = get_int4(f.vertices_raw()); - BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]]; + BL::MeshVertex v = userdata->b_mesh.vertices[vi[vert_num]]; float3 vP = get_float3(v.co()); P[0] = vP.x; @@ -148,9 +148,9 @@ static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float uv[1] = tfuv.y; } else { - int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num]; + int vert_idx = userdata->b_mesh.tessfaces[face_num].vertices()[vert_num]; float3 orco = - get_float3(userdata->mesh.vertices[vert_idx].undeformed_co()); + get_float3(userdata->b_mesh.vertices[vert_idx].undeformed_co()); float2 tmp = map_to_sphere(make_float3(orco[0], orco[1], orco[2])); uv[0] = tmp.x; uv[1] = tmp.y; @@ -160,12 +160,12 @@ static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], const int face_num, const int vert_num) { MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTessFace f = userdata->mesh.tessfaces[face_num]; + BL::MeshTessFace f = userdata->b_mesh.tessfaces[face_num]; float3 vN; if(f.use_smooth()) { int4 vi = get_int4(f.vertices_raw()); - BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]]; + BL::MeshVertex v = userdata->b_mesh.vertices[vi[vert_num]]; vN = get_float3(v.normal()); } else { @@ -250,7 +250,7 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh, for(int i = 0; i < nverts.size(); i++) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + 
face_split_tri_indices(face_flags[i], tri_a, tri_b); tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]); tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]); @@ -376,7 +376,7 @@ static void attr_create_vertex_color(Scene *scene, for(l->data.begin(c); c != l->data.end(); ++c, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); uchar4 colors[4]; colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1()))); @@ -469,7 +469,7 @@ static void attr_create_uv_map(Scene *scene, for(l->data.begin(t); t != l->data.end(); ++t, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); float3 uvs[4]; uvs[0] = get_float3(t->uv1()); @@ -719,6 +719,11 @@ static void create_mesh(Scene *scene, int numngons = 0; bool use_loop_normals = b_mesh.use_auto_smooth() && (mesh->subdivision_type != Mesh::SUBDIVISION_CATMULL_CLARK); + /* If no faces, create empty mesh. 
*/ + if(numfaces == 0) { + return; + } + BL::Mesh::vertices_iterator v; BL::Mesh::tessfaces_iterator f; BL::Mesh::polygons_iterator p; @@ -1079,7 +1084,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, } /* free derived mesh */ - b_data.meshes.remove(b_mesh, false); + b_data.meshes.remove(b_mesh, false, true, false); } } mesh->geometry_flags = requested_geometry_flags; @@ -1299,7 +1304,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, sync_curves(mesh, b_mesh, b_ob, true, time_index); /* free derived mesh */ - b_data.meshes.remove(b_mesh, false); + b_data.meshes.remove(b_mesh, false, true, false); } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index a930c439370..63138c060fb 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -63,8 +63,25 @@ bool BlenderSync::object_is_mesh(BL::Object& b_ob) { BL::ID b_ob_data = b_ob.data(); - return (b_ob_data && (b_ob_data.is_a(&RNA_Mesh) || - b_ob_data.is_a(&RNA_Curve) || b_ob_data.is_a(&RNA_MetaBall))); + if(!b_ob_data) { + return false; + } + + if(b_ob.type() == BL::Object::type_CURVE) { + /* Skip exporting curves without faces, overhead can be + * significant if there are many for path animation. */ + BL::Curve b_curve(b_ob.data()); + + return (b_curve.bevel_object() || + b_curve.extrude() != 0.0f || + b_curve.bevel_depth() != 0.0f || + b_ob.modifiers.length()); + } + else { + return (b_ob_data.is_a(&RNA_Mesh) || + b_ob_data.is_a(&RNA_Curve) || + b_ob_data.is_a(&RNA_MetaBall)); + } } bool BlenderSync::object_is_light(BL::Object& b_ob) @@ -268,6 +285,29 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, return NULL; } + /* Visibility flags for both parent and child. 
*/ + bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0; + uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY; + + if(b_parent.ptr.data != b_ob.ptr.data) { + visibility &= object_ray_visibility(b_parent); + } + + /* Make holdout objects on excluded layer invisible for non-camera rays. */ + if(use_holdout && (layer_flag & render_layer.exclude_layer)) { + visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA); + } + + /* Hide objects not on render layer from camera rays. */ + if(!(layer_flag & render_layer.layer)) { + visibility &= ~PATH_RAY_CAMERA; + } + + /* Don't export completely invisible objects. */ + if(visibility == 0) { + return NULL; + } + /* key to lookup object */ ObjectKey key(b_parent, persistent_id, b_ob); Object *object; @@ -308,8 +348,6 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, if(object_map.sync(&object, b_ob, b_parent, key)) object_updated = true; - bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0; - /* mesh sync */ object->mesh = sync_mesh(b_ob, object_updated, hide_tris); @@ -322,22 +360,6 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, object_updated = true; } - /* visibility flags for both parent and child */ - uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY; - if(b_parent.ptr.data != b_ob.ptr.data) { - visibility &= object_ray_visibility(b_parent); - } - - /* make holdout objects on excluded layer invisible for non-camera rays */ - if(use_holdout && (layer_flag & render_layer.exclude_layer)) { - visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA); - } - - /* hide objects not on render layer from camera rays */ - if(!(layer_flag & render_layer.layer)) { - visibility &= ~PATH_RAY_CAMERA; - } - if(visibility != object->visibility) { object->visibility = visibility; object_updated = true; diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 54973fd1b7f..e268c9a0d35 100644 --- 
a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -60,6 +60,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) /* Backup some settings for comparison. */ DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; DebugFlags::OpenCL::KernelType opencl_kernel_type = flags.opencl.kernel_type; + /* Synchronize shared flags. */ + flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type"); /* Synchronize CPU flags. */ flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2"); flags.cpu.avx = get_boolean(cscene, "debug_use_cpu_avx"); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 2b5dd5eadea..12de3da063f 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -129,9 +129,9 @@ void BlenderSession::create_session() scene = new Scene(scene_params, session_params.device); /* setup callbacks for builtin image support */ - scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7); - scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4); - scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4); + scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7, _8); + scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5); + scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5); /* create session */ session = new Session(session_params); @@ -1013,7 +1013,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, int &width, int &height, int &depth, - 
int &channels) + int &channels, + bool& free_cache) { /* empty image */ is_float = false; @@ -1021,6 +1022,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name, height = 1; depth = 0; channels = 0; + free_cache = false; if(!builtin_data) return; @@ -1034,6 +1036,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name, /* image data */ BL::Image b_image(b_id); + free_cache = !b_image.has_data(); is_float = b_image.is_float(); width = b_image.size()[0]; height = b_image.size()[1]; @@ -1094,7 +1097,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1115,7 +1119,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1134,6 +1137,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, } } } + + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. */ + if(free_cache) { + b_image.buffers_free(); + } + /* Premultiply, byte images are always straight for Blender. 
*/ unsigned char *cp = pixels; for(size_t i = 0; i < num_pixels; i++, cp += channels) { @@ -1147,7 +1160,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1172,7 +1186,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(float)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1192,6 +1205,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, } } + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. */ + if(free_cache) { + b_image.buffers_free(); + } + return true; } else if(b_id.is_a(&RNA_Object)) { diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 536808c5b18..cbd2303d282 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -156,15 +156,18 @@ protected: int &width, int &height, int &depth, - int &channels); + int &channels, + bool &free_cache); bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); /* Update tile manager to reflect resumable render settings. 
*/ void update_resumable_tile_manager(int num_samples); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 3a00384458a..42e3721883f 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -234,7 +234,6 @@ void BlenderSync::sync_integrator() Integrator *integrator = scene->integrator; Integrator previntegrator = *integrator; - integrator->min_bounce = get_int(cscene, "min_bounces"); integrator->max_bounce = get_int(cscene, "max_bounces"); integrator->max_diffuse_bounce = get_int(cscene, "diffuse_bounces"); @@ -243,8 +242,6 @@ void BlenderSync::sync_integrator() integrator->max_volume_bounce = get_int(cscene, "volume_bounces"); integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces"); - integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces"); - integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows"); integrator->volume_max_steps = get_int(cscene, "volume_max_steps"); integrator->volume_step_size = get_float(cscene, "volume_step_size"); @@ -629,14 +626,10 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, else if(shadingsystem == 1) params.shadingsystem = SHADINGSYSTEM_OSL; - if(background) + if(background || DebugFlags().viewport_static_bvh) params.bvh_type = SceneParams::BVH_STATIC; else - params.bvh_type = (SceneParams::BVHType)get_enum( - cscene, - "debug_bvh_type", - SceneParams::BVH_NUM_TYPES, - SceneParams::BVH_STATIC); + params.bvh_type = SceneParams::BVH_DYNAMIC; params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits"); params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh"); @@ -810,6 +803,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, } params.start_resolution = get_int(cscene, "preview_start_resolution"); + params.pixel_size = b_engine.get_preview_pixel_size(b_scene); /* other parameters */ 
if(b_scene.render().threads_mode() == BL::RenderSettings::threads_mode_FIXED) @@ -830,6 +824,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.progressive = false; params.start_resolution = INT_MAX; + params.pixel_size = 1; } else params.progressive = true; diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index ebbf325f95b..363e19f7a20 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data, bool calc_undeformed, Mesh::SubdivisionType subdivision_type) { - bool subsurf_mod_show_render; - bool subsurf_mod_show_viewport; + bool subsurf_mod_show_render = false; + bool subsurf_mod_show_viewport = false; if(subdivision_type != Mesh::SUBDIVISION_NONE) { BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1]; diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 33143e2d8aa..0ad3c8a7429 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -153,7 +153,6 @@ void BVH::pack_primitives() if(pack.prim_index[i] != -1) { int tob = pack.prim_object[i]; Object *ob = objects[tob]; - if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { pack_triangle(i, (float4*)&pack.prim_tri_verts[3 * prim_triangle_index]); pack.prim_tri_index[i] = 3 * prim_triangle_index; @@ -162,11 +161,10 @@ void BVH::pack_primitives() else { pack.prim_tri_index[i] = -1; } - - pack.prim_visibility[i] = ob->visibility; - - if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) + pack.prim_visibility[i] = ob->visibility_for_tracing(); + if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { pack.prim_visibility[i] |= PATH_RAY_CURVE; + } } else { pack.prim_tri_index[i] = -1; diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp index 340ba7dcf53..9aa8e71dfd0 100644 --- a/intern/cycles/bvh/bvh2.cpp +++ b/intern/cycles/bvh/bvh2.cpp @@ -312,10 +312,8 @@ void BVH2::refit_node(int 
idx, bool leaf, BoundBox& bbox, uint& visibility) } } } - - visibility |= ob->visibility; + visibility |= ob->visibility_for_tracing(); } - /* TODO(sergey): De-duplicate with pack_leaf(). */ float4 leaf_data[BVH_NODE_LEAF_SIZE]; leaf_data[0].x = __int_as_float(c0); diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp index 5034ab811d5..777de20423b 100644 --- a/intern/cycles/bvh/bvh4.cpp +++ b/intern/cycles/bvh/bvh4.cpp @@ -242,21 +242,21 @@ void BVH4::pack_unaligned_node(int idx, * so kernel might safely assume there are always 4 child nodes. */ - data[1][i] = 1.0f; - data[2][i] = 0.0f; - data[3][i] = 0.0f; + data[1][i] = NAN; + data[2][i] = NAN; + data[3][i] = NAN; - data[4][i] = 0.0f; - data[5][i] = 0.0f; - data[6][i] = 0.0f; + data[4][i] = NAN; + data[5][i] = NAN; + data[6][i] = NAN; - data[7][i] = 0.0f; - data[8][i] = 0.0f; - data[9][i] = 0.0f; + data[7][i] = NAN; + data[8][i] = NAN; + data[9][i] = NAN; - data[10][i] = -FLT_MAX; - data[11][i] = -FLT_MAX; - data[12][i] = -FLT_MAX; + data[10][i] = NAN; + data[11][i] = NAN; + data[12][i] = NAN; data[13][i] = __int_as_float(0); } @@ -438,10 +438,8 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) } } } - - visibility |= ob->visibility; + visibility |= ob->visibility_for_tracing(); } - /* TODO(sergey): This is actually a copy of pack_leaf(), * but this chunk of code only knows actual data and has * no idea about BVHNode. diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 1880964355c..649ce52da05 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -529,7 +529,9 @@ BVHNode* BVHBuild::run() << " Allocation slop factor: " << ((prim_type.capacity() != 0) ? 
(float)prim_type.size() / prim_type.capacity() - : 1.0f) << "\n"; + : 1.0f) << "\n" + << " Maximum depth: " + << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_DEPTH)) << "\n"; } } @@ -671,7 +673,7 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level) return create_leaf_node(range, references); } } - /* Check whether unaligned split is better than the regulat one. */ + /* Check whether unaligned split is better than the regular one. */ if(unalignedSplitSAH < splitSAH) { do_unalinged_split = true; } @@ -865,7 +867,7 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, prim_time[start] = make_float2(ref->time_from(), ref->time_to()); } - uint visibility = objects[ref->prim_object()]->visibility; + const uint visibility = objects[ref->prim_object()]->visibility_for_tracing(); BVHNode *leaf_node = new LeafNode(ref->bounds(), visibility, start, start+1); leaf_node->time_from = ref->time_from(); leaf_node->time_to = ref->time_to(); @@ -939,7 +941,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, ref.time_to())); bounds[type_index].grow(ref.bounds()); - visibility[type_index] |= objects[ref.prim_object()]->visibility; + visibility[type_index] |= objects[ref.prim_object()]->visibility_for_tracing(); if(ref.prim_type() & PRIMITIVE_ALL_CURVE) { visibility[type_index] |= PATH_RAY_CURVE; } @@ -1040,7 +1042,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, */ start_index = spatial_free_index; spatial_free_index += range.size(); - /* Extend an array when needed. */ const size_t range_end = start_index + range.size(); if(prim_type.size() < range_end) { @@ -1066,8 +1067,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, prim_time.resize(range_end); } } - spatial_spin_lock.unlock(); - /* Perform actual data copy. 
*/ if(new_leaf_data_size > 0) { memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size); @@ -1077,6 +1076,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data); } } + spatial_spin_lock.unlock(); } else { /* For the regular BVH builder we simply copy new data starting at the diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 4237c62ab5b..ab6df4d265d 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -132,6 +132,17 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const case BVH_STAT_UNALIGNED_LEAF_COUNT: cnt = (is_leaf() && is_unaligned) ? 1 : 0; break; + case BVH_STAT_DEPTH: + if(is_leaf()) { + cnt = 1; + } + else { + for(int i = 0; i < num_children(); i++) { + cnt = max(cnt, get_child(i)->getSubtreeSize(stat)); + } + cnt += 1; + } + return cnt; default: assert(0); /* unknown mode */ } diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 1c875f5a524..94cf5ab730c 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -38,6 +38,7 @@ enum BVH_STAT { BVH_STAT_UNALIGNED_INNER_QNODE_COUNT, BVH_STAT_ALIGNED_LEAF_COUNT, BVH_STAT_UNALIGNED_LEAF_COUNT, + BVH_STAT_DEPTH, }; class BVHParams; diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 74ec57ddf74..3c632160fbd 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -34,11 +34,13 @@ set(SRC set(SRC_OPENCL opencl/opencl.h + opencl/memory_manager.h opencl/opencl_base.cpp opencl/opencl_mega.cpp opencl/opencl_split.cpp opencl/opencl_util.cpp + opencl/memory_manager.cpp ) if(WITH_CYCLES_NETWORK) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index a54bb77f9f3..f64436aec7b 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -379,11 +379,9 @@ DeviceInfo 
Device::get_multi_device(vector<DeviceInfo> subdevices) info.num = 0; info.has_bindless_textures = true; - info.pack_images = false; foreach(DeviceInfo &device, subdevices) { assert(device.type == info.multi_devices[0].type); - info.pack_images |= device.pack_images; info.has_bindless_textures &= device.has_bindless_textures; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index b3b693c630c..26d6d380a10 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -53,7 +53,6 @@ public: int num; bool display_device; bool advanced_shading; - bool pack_images; bool has_bindless_textures; /* flag for GPU and Multi device */ bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */ vector<DeviceInfo> multi_devices; @@ -65,7 +64,6 @@ public: num = 0; display_device = false; advanced_shading = true; - pack_images = false; has_bindless_textures = false; use_split_kernel = false; } diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 18112437b45..6a1106328fb 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -48,6 +48,7 @@ #include "util/util_logging.h" #include "util/util_map.h" #include "util/util_opengl.h" +#include "util/util_optimization.h" #include "util/util_progress.h" #include "util/util_system.h" #include "util/util_thread.h" @@ -119,7 +120,7 @@ public: } #endif - if(strstr(architecture_name, logged_architecture) != 0) { + if(strcmp(architecture_name, logged_architecture) != 0) { VLOG(1) << "Will be using " << architecture_name << " kernels."; logged_architecture = architecture_name; } @@ -976,7 +977,6 @@ void device_cpu_info(vector<DeviceInfo>& devices) info.id = "CPU"; info.num = 0; info.advanced_shading = true; - info.pack_images = false; devices.insert(devices.begin(), info); } diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 3a29538aa13..29b5bd70789 
100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -111,6 +111,16 @@ public: virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); }; +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { +public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + +private: + CUDADevice *device; +}; + class CUDADevice : public Device { public: @@ -206,16 +216,6 @@ public: cuda_error_documentation(); } - void cuda_push_context() - { - cuda_assert(cuCtxSetCurrent(cuContext)); - } - - void cuda_pop_context() - { - cuda_assert(cuCtxSetCurrent(NULL)); - } - CUDADevice(DeviceInfo& info, Stats &stats, bool background_) : Device(info, stats, background_) { @@ -263,7 +263,8 @@ public: cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); cuDevArchitecture = major*100 + minor*10; - cuda_pop_context(); + /* Pop context set by cuCtxCreate. */ + cuCtxPopCurrent(NULL); } ~CUDADevice() @@ -519,7 +520,7 @@ public: return false; /* open module */ - cuda_push_context(); + CUDAContextScope scope(this); string cubin_data; CUresult result; @@ -540,8 +541,6 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); - cuda_pop_context(); - return (result == CUDA_SUCCESS); } @@ -556,36 +555,36 @@ public: void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + CUDAContextScope scope(this); + if(name) { VLOG(1) << "Buffer allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. 
(" << string_human_readable_size(mem.memory_size()) << ")"; } - cuda_push_context(); CUdeviceptr device_pointer; size_t size = mem.memory_size(); cuda_assert(cuMemAlloc(&device_pointer, size)); mem.device_pointer = (device_ptr)device_pointer; mem.device_size = size; stats.mem_alloc(size); - cuda_pop_context(); } void mem_copy_to(device_memory& mem) { - cuda_push_context(); + CUDAContextScope scope(this); + if(mem.device_pointer) cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())); - cuda_pop_context(); } void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) { + CUDAContextScope scope(this); size_t offset = elem*y*w; size_t size = elem*w*h; - cuda_push_context(); if(mem.device_pointer) { cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size)); @@ -593,7 +592,6 @@ public: else { memset((char*)mem.data_pointer + offset, 0, size); } - cuda_pop_context(); } void mem_zero(device_memory& mem) @@ -602,18 +600,17 @@ public: memset((void*)mem.data_pointer, 0, mem.memory_size()); } - cuda_push_context(); - if(mem.device_pointer) + if(mem.device_pointer) { + CUDAContextScope scope(this); cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); - cuda_pop_context(); + } } void mem_free(device_memory& mem) { if(mem.device_pointer) { - cuda_push_context(); + CUDAContextScope scope(this); cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))); - cuda_pop_context(); mem.device_pointer = 0; @@ -629,14 +626,13 @@ public: void const_copy_to(const char *name, void *host, size_t size) { + CUDAContextScope scope(this); CUdeviceptr mem; size_t bytes; - cuda_push_context(); cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); //assert(bytes == size); cuda_assert(cuMemcpyHtoD(mem, host, size)); - cuda_pop_context(); } void tex_alloc(const char *name, @@ -644,6 +640,8 @@ public: InterpolationType interpolation, ExtensionType 
extension) { + CUDAContextScope scope(this); + VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; @@ -706,9 +704,7 @@ public: tokens[3].c_str()); } - cuda_push_context(); cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); - cuda_pop_context(); if(!texref) { return; @@ -721,8 +717,6 @@ public: mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); - cuda_push_context(); - CUdeviceptr cumem; size_t cubytes; @@ -738,28 +732,20 @@ public: uint32_t ptr = (uint32_t)mem.device_pointer; cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - - cuda_pop_context(); } else { mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); - cuda_push_context(); - cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); - - cuda_pop_context(); } } /* Texture Storage */ else { CUarray handle = NULL; - cuda_push_context(); - if(mem.data_depth > 1) { CUDA_ARRAY3D_DESCRIPTOR desc; @@ -784,7 +770,6 @@ public: } if(!handle) { - cuda_pop_context(); return; } @@ -877,14 +862,10 @@ public: cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); } - - cuda_pop_context(); } /* Fermi, Data and Image Textures */ if(!has_bindless_textures) { - cuda_push_context(); - cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); if(mem.data_depth > 1) { @@ -892,8 +873,6 @@ public: } cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); - - cuda_pop_context(); } /* Fermi and Kepler */ @@ -904,9 +883,8 @@ public: { if(mem.device_pointer) { if(tex_interp_map[mem.device_pointer]) { - cuda_push_context(); + CUDAContextScope scope(this); 
cuArrayDestroy((CUarray)mem.device_pointer); - cuda_pop_context(); /* Free CUtexObject (Bindless Textures) */ if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) { @@ -960,7 +938,7 @@ public: if(have_error()) return false; - cuda_push_context(); + CUDAContextScope scope(this); int4 rect = task->rect; int w = align_up(rect.z-rect.x, 4); @@ -1017,7 +995,6 @@ public: CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return !have_error(); } @@ -1026,7 +1003,7 @@ public: if(have_error()) return false; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuFilterConstructTransform; cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); @@ -1046,7 +1023,6 @@ public: CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return !have_error(); } @@ -1058,11 +1034,11 @@ public: if(have_error()) return false; + CUDAContextScope scope(this); + mem_zero(task->storage.XtWX); mem_zero(task->storage.XtWY); - cuda_push_context(); - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); @@ -1150,7 +1126,6 @@ public: CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return !have_error(); } @@ -1161,7 +1136,7 @@ public: if(have_error()) return false; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuFilterCombineHalves; cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); @@ -1179,7 +1154,6 @@ public: CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return 
!have_error(); } @@ -1190,7 +1164,7 @@ public: if(have_error()) return false; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuFilterDivideShadow; cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); @@ -1214,7 +1188,6 @@ public: CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return !have_error(); } @@ -1227,7 +1200,7 @@ public: if(have_error()) return false; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuFilterGetFeature; cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); @@ -1250,7 +1223,6 @@ public: CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return !have_error(); } @@ -1263,7 +1235,7 @@ public: if(have_error()) return false; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuFilterDetectOutliers; cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); @@ -1282,7 +1254,6 @@ public: CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); return !have_error(); } @@ -1319,7 +1290,7 @@ public: if(have_error()) return; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuPathTrace; CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); @@ -1333,8 +1304,9 @@ public: cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); } - if(have_error()) + if(have_error()) { return; + } /* pass in parameters */ void *args[] = {&d_buffer, @@ -1370,8 +1342,6 @@ public: 0, 0, args, 0)); cuda_assert(cuCtxSynchronize()); - - cuda_pop_context(); } void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) @@ -1379,7 +1349,7 @@ public: if(have_error()) return; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction 
cuFilmConvert; CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half); @@ -1424,8 +1394,6 @@ public: 0, 0, args, 0)); unmap_pixels((rgba_byte)? rgba_byte: rgba_half); - - cuda_pop_context(); } void shader(DeviceTask& task) @@ -1433,7 +1401,7 @@ public: if(have_error()) return; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuShader; CUdeviceptr d_input = cuda_device_ptr(task.shader_input); @@ -1498,8 +1466,6 @@ public: task.update_progress(NULL); } - - cuda_pop_context(); } CUdeviceptr map_pixels(device_ptr mem) @@ -1535,7 +1501,7 @@ public: pmem.w = mem.data_width; pmem.h = mem.data_height; - cuda_push_context(); + CUDAContextScope scope(this); glGenBuffers(1, &pmem.cuPBO); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); @@ -1559,8 +1525,6 @@ public: CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); if(result == CUDA_SUCCESS) { - cuda_pop_context(); - mem.device_pointer = pmem.cuTexId; pixel_mem_map[mem.device_pointer] = pmem; @@ -1574,8 +1538,6 @@ public: glDeleteBuffers(1, &pmem.cuPBO); glDeleteTextures(1, &pmem.cuTexId); - cuda_pop_context(); - background = true; } } @@ -1588,7 +1550,7 @@ public: if(!background) { PixelMem pmem = pixel_mem_map[mem.device_pointer]; - cuda_push_context(); + CUDAContextScope scope(this); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); @@ -1597,8 +1559,6 @@ public: glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - cuda_pop_context(); - return; } @@ -1611,14 +1571,12 @@ public: if(!background) { PixelMem pmem = pixel_mem_map[mem.device_pointer]; - cuda_push_context(); + CUDAContextScope scope(this); cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); glDeleteBuffers(1, &pmem.cuPBO); glDeleteTextures(1, &pmem.cuTexId); - cuda_pop_context(); - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); 
mem.device_pointer = 0; @@ -1639,7 +1597,7 @@ public: PixelMem pmem = pixel_mem_map[mem.device_pointer]; float *vpointer; - cuda_push_context(); + CUDAContextScope scope(this); /* for multi devices, this assumes the inefficient method that we allocate * all pixels on the device even though we only render to a subset */ @@ -1728,8 +1686,6 @@ public: glBindTexture(GL_TEXTURE_2D, 0); glDisable(GL_TEXTURE_2D); - cuda_pop_context(); - return; } @@ -1738,6 +1694,8 @@ public: void thread_run(DeviceTask *task) { + CUDAContextScope scope(this); + if(task->type == DeviceTask::RENDER) { RenderTile tile; @@ -1805,9 +1763,7 @@ public: shader(*task); - cuda_push_context(); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); } } @@ -1828,12 +1784,11 @@ public: void task_add(DeviceTask& task) { if(task.type == DeviceTask::FILM_CONVERT) { + CUDAContextScope scope(this); + /* must be done in main thread due to opengl access */ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - - cuda_push_context(); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); } else { task_pool.push(new CUDADeviceTask(this, task)); @@ -1852,6 +1807,7 @@ public: friend class CUDASplitKernelFunction; friend class CUDASplitKernel; + friend class CUDAContextScope; }; /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class @@ -1872,6 +1828,20 @@ public: } \ } (void)0 + +/* CUDA context scope. 
*/ + +CUDAContextScope::CUDAContextScope(CUDADevice *device) +: device(device) +{ + cuda_assert(cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_assert(cuCtxPopCurrent(NULL)); +} + /* split kernel */ class CUDASplitKernelFunction : public SplitKernelFunction{ @@ -1889,30 +1859,24 @@ public: /* enqueue the kernel, returns false if there is an error */ bool enqueue(const KernelDimensions &dim, void *args[]) { - device->cuda_push_context(); - if(device->have_error()) return false; + CUDAContextScope scope(device); + /* we ignore dim.local_size for now, as this is faster */ int threads_per_block; cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - - int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; - int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); cuda_assert(cuLaunchKernel(func, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ + xblocks, 1, 1, /* blocks */ + threads_per_block, 1, 1, /* threads */ 0, 0, args, 0)); - device->cuda_pop_context(); - return !device->have_error(); } }; @@ -1923,12 +1887,12 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device) uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) { + CUDAContextScope scope(device); + device_vector<uint64_t> size_buffer; size_buffer.resize(1); device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); - device->cuda_push_context(); - uint threads = num_threads; CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); @@ -1950,8 +1914,6 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory 
1, 1, 1, 0, 0, (void**)&args, 0)); - device->cuda_pop_context(); - device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t)); device->mem_free(size_buffer); @@ -1969,7 +1931,7 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim device_memory& use_queues_flag, device_memory& work_pool_wgs) { - device->cuda_push_context(); + CUDAContextScope scope(device); CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); @@ -2033,26 +1995,21 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); - device->cuda_pop_context(); - return !device->have_error(); } SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, const DeviceRequestedFeatures&) { + CUDAContextScope scope(device); CUfunction func; - device->cuda_push_context(); - cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); if(device->have_error()) { device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); return NULL; } - device->cuda_pop_context(); - return new CUDASplitKernelFunction(device, func); } @@ -2063,12 +2020,11 @@ int2 CUDASplitKernel::split_kernel_local_size() int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) { + CUDAContextScope scope(device); size_t free; size_t total; - device->cuda_push_context(); cuda_assert(cuMemGetInfo(&free, &total)); - device->cuda_pop_context(); VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) << " bytes. 
(" @@ -2127,18 +2083,34 @@ Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background) return new CUDADevice(info, stats, background); } -void device_cuda_info(vector<DeviceInfo>& devices) +static CUresult device_cuda_safe_init() { - CUresult result; - int count = 0; +#ifdef _WIN32 + __try { + return cuInit(0); + } + __except(EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the CUDA driver and hope we can + * survive even with corrupted CUDA installs. */ + fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); + } - result = cuInit(0); + return CUDA_ERROR_NO_DEVICE; +#else + return cuInit(0); +#endif +} + +void device_cuda_info(vector<DeviceInfo>& devices) +{ + CUresult result = device_cuda_safe_init(); if(result != CUDA_SUCCESS) { if(result != CUDA_ERROR_NO_DEVICE) fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); return; } + int count = 0; result = cuDeviceGetCount(&count); if(result != CUDA_SUCCESS) { fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); @@ -2168,7 +2140,6 @@ void device_cuda_info(vector<DeviceInfo>& devices) info.advanced_shading = (major >= 2); info.has_bindless_textures = (major >= 3); - info.pack_images = false; int pci_location[3] = {0, 0, 0}; cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); @@ -2196,7 +2167,7 @@ void device_cuda_info(vector<DeviceInfo>& devices) string device_cuda_capabilities(void) { - CUresult result = cuInit(0); + CUresult result = device_cuda_safe_init(); if(result != CUDA_SUCCESS) { if(result != CUDA_ERROR_NO_DEVICE) { return string("Error initializing CUDA: ") + cuewErrorString(result); diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 66758954f44..571ba9465ca 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -344,7 +344,6 @@ void device_network_info(vector<DeviceInfo>& devices) info.id = "NETWORK"; info.num = 
0; info.advanced_shading = true; /* todo: get this info from device */ - info.pack_images = false; devices.push_back(info); } diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 681b8214b03..9d89decaaaf 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -73,8 +73,34 @@ bool device_opencl_init(void) return result; } + +static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) +{ +#ifdef _WIN32 + __try { + return clGetPlatformIDs(0, NULL, num_platforms); + } + __except(EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the OpenCL driver and hope we can + * survive even with corrupted OpenCL installs. */ + fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); + } + + *num_platforms = 0; + return CL_DEVICE_NOT_FOUND; +#else + return clGetPlatformIDs(0, NULL, num_platforms); +#endif +} + void device_opencl_info(vector<DeviceInfo>& devices) { + cl_uint num_platforms = 0; + device_opencl_get_num_platforms_safe(&num_platforms); + if(num_platforms == 0) { + return; + } + vector<OpenCLPlatformDevice> usable_devices; OpenCLInfo::get_usable_devices(&usable_devices); /* Devices are numbered consecutively across platforms. */ @@ -95,7 +121,6 @@ void device_opencl_info(vector<DeviceInfo>& devices) /* We don't know if it's used for display, but assume it is. */ info.display_device = true; info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name); - info.pack_images = true; info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name, device_type); info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; @@ -114,7 +139,7 @@ string device_opencl_capabilities(void) * it could also be nicely reported to the console. 
*/ cl_uint num_platforms = 0; - opencl_assert(clGetPlatformIDs(0, NULL, &num_platforms)); + opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); if(num_platforms == 0) { return "No OpenCL platforms found\n"; } diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp new file mode 100644 index 00000000000..b67dfef88aa --- /dev/null +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -0,0 +1,253 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPENCL + +#include "util/util_foreach.h" + +#include "device/opencl/opencl.h" +#include "device/opencl/memory_manager.h" + +CCL_NAMESPACE_BEGIN + +void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) +{ + allocations.push_back(&allocation); +} + +void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device) +{ + bool need_realloc = false; + + /* Calculate total size and remove any freed. */ + size_t total_size = 0; + + for(int i = allocations.size()-1; i >= 0; i--) { + Allocation* allocation = allocations[i]; + + /* Remove allocations that have been freed. */ + if(!allocation->mem || allocation->mem->memory_size() == 0) { + allocation->device_buffer = NULL; + allocation->size = 0; + + allocations.erase(allocations.begin()+i); + + need_realloc = true; + + continue; + } + + /* Get actual size for allocation. 
*/ + size_t alloc_size = align_up(allocation->mem->memory_size(), 16); + + if(allocation->size != alloc_size) { + /* Allocation is either new or resized. */ + allocation->size = alloc_size; + allocation->needs_copy_to_device = true; + + need_realloc = true; + } + + total_size += alloc_size; + } + + if(need_realloc) { + cl_ulong max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(total_size > max_buffer_size) { + device->set_error("Scene too complex to fit in available memory."); + return; + } + + device_memory *new_buffer = new device_memory; + + new_buffer->resize(total_size); + device->mem_alloc(string_printf("buffer_%p", this).data(), *new_buffer, MEM_READ_ONLY); + + size_t offset = 0; + + foreach(Allocation* allocation, allocations) { + if(allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(new_buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + (void*)allocation->mem->data_pointer, + 0, NULL, NULL + )); + + allocation->needs_copy_to_device = false; + } + else { + /* Fast copy from memory already on device. */ + opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_MEM_PTR(new_buffer->device_pointer), + allocation->desc.offset, + offset, + allocation->mem->memory_size(), + 0, NULL, NULL + )); + } + + allocation->desc.offset = offset; + offset += allocation->size; + } + + device->mem_free(*buffer); + delete buffer; + + buffer = new_buffer; + } + else { + assert(total_size == buffer->data_size); + + size_t offset = 0; + + foreach(Allocation* allocation, allocations) { + if(allocation->needs_copy_to_device) { + /* Copy from host to device. 
*/ + opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + (void*)allocation->mem->data_pointer, + 0, NULL, NULL + )); + + allocation->needs_copy_to_device = false; + } + + offset += allocation->size; + } + } + + /* Not really necessary, but seems to improve responsiveness for some reason. */ + clFinish(device->cqCommandQueue); +} + +void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *device) +{ + device->mem_free(*buffer); +} + +MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() +{ + DeviceBuffer* smallest = device_buffers; + + foreach(DeviceBuffer& device_buffer, device_buffers) { + if(device_buffer.size < smallest->size) { + smallest = &device_buffer; + } + } + + return smallest; +} + +MemoryManager::MemoryManager(OpenCLDeviceBase *device) : device(device), need_update(false) +{ +} + +void MemoryManager::free() +{ + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.free(device); + } +} + +void MemoryManager::alloc(const char *name, device_memory& mem) +{ + Allocation& allocation = allocations[name]; + + allocation.mem = &mem; + allocation.needs_copy_to_device = true; + + if(!allocation.device_buffer) { + DeviceBuffer* device_buffer = smallest_device_buffer(); + allocation.device_buffer = device_buffer; + + allocation.desc.device_buffer = device_buffer - device_buffers; + + device_buffer->add_allocation(allocation); + + device_buffer->size += mem.memory_size(); + } + + need_update = true; +} + +bool MemoryManager::free(device_memory& mem) +{ + foreach(AllocationsMap::value_type& value, allocations) { + Allocation& allocation = value.second; + if(allocation.mem == &mem) { + + allocation.device_buffer->size -= mem.memory_size(); + + allocation.mem = NULL; + allocation.needs_copy_to_device = false; + + need_update = true; + return true; + } + } + + return false; +} + +MemoryManager::BufferDescriptor 
MemoryManager::get_descriptor(string name) +{ + update_device_memory(); + + Allocation& allocation = allocations[name]; + return allocation.desc; +} + +void MemoryManager::update_device_memory() +{ + if(!need_update) { + return; + } + + need_update = false; + + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.update_device_memory(device); + } +} + +void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + update_device_memory(); + + foreach(DeviceBuffer& device_buffer, device_buffers) { + if(device_buffer.buffer->device_pointer) { + device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); + } + else { + device->kernel_set_args(kernel, (*narg)++, device->null_mem); + } + } +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPENCL */ + diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h new file mode 100644 index 00000000000..3714405d026 --- /dev/null +++ b/intern/cycles/device/opencl/memory_manager.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "device/device.h" + +#include "util/util_map.h" +#include "util/util_vector.h" +#include "util/util_string.h" + +#include "clew.h" + +CCL_NAMESPACE_BEGIN + +class OpenCLDeviceBase; + +class MemoryManager { +public: + static const int NUM_DEVICE_BUFFERS = 8; + + struct BufferDescriptor { + uint device_buffer; + cl_ulong offset; + }; + +private: + struct DeviceBuffer; + + struct Allocation { + device_memory *mem; + + DeviceBuffer *device_buffer; + size_t size; /* Size of actual allocation, may be larger than requested. */ + + BufferDescriptor desc; + + bool needs_copy_to_device; + + Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) + { + } + }; + + struct DeviceBuffer { + device_memory *buffer; + vector<Allocation*> allocations; + size_t size; /* Size of all allocations. */ + + DeviceBuffer() : buffer(new device_memory), size(0) + { + } + + ~DeviceBuffer() { + delete buffer; + buffer = NULL; + } + + void add_allocation(Allocation& allocation); + + void update_device_memory(OpenCLDeviceBase *device); + + void free(OpenCLDeviceBase *device); + }; + + OpenCLDeviceBase *device; + + DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; + + typedef unordered_map<string, Allocation> AllocationsMap; + AllocationsMap allocations; + + bool need_update; + + DeviceBuffer* smallest_device_buffer(); + +public: + MemoryManager(OpenCLDeviceBase *device); + + void free(); /* Free all memory. 
*/ + + void alloc(const char *name, device_memory& mem); + bool free(device_memory& mem); + + BufferDescriptor get_descriptor(string name); + + void update_device_memory(); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); +}; + +CCL_NAMESPACE_END + diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 78ca377d933..26bf4a9af5b 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -25,6 +25,8 @@ #include "clew.h" +#include "device/opencl/memory_manager.h" + CCL_NAMESPACE_BEGIN /* Disable workarounds, seems to be working fine on latest drivers. */ @@ -224,6 +226,18 @@ public: static string get_kernel_md5(); }; +#define opencl_device_assert(device, stmt) \ + { \ + cl_int err = stmt; \ + \ + if(err != CL_SUCCESS) { \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if((device)->error_message() == "") \ + (device)->set_error(message); \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } (void)0 + #define opencl_assert(stmt) \ { \ cl_int err = stmt; \ @@ -344,6 +358,7 @@ public: size_t global_size_round_up(int group_size, int global_size); void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); @@ -525,6 +540,42 @@ protected: virtual string build_options_for_base_program( const DeviceRequestedFeatures& /*requested_features*/); + +private: + MemoryManager memory_manager; + friend class MemoryManager; + + struct tex_info_t { + uint buffer, padding; + cl_ulong offset; + uint width, height, depth, options; + }; + static_assert_align(tex_info_t, 16); + + vector<tex_info_t> texture_descriptors; + 
device_memory texture_descriptors_buffer; + + struct Texture { + Texture() {} + Texture(device_memory* mem, + InterpolationType interpolation, + ExtensionType extension) + : mem(mem), + interpolation(interpolation), + extension(extension) { + } + device_memory* mem; + InterpolationType interpolation; + ExtensionType extension; + }; + + typedef map<string, Texture> TexturesMap; + TexturesMap textures; + + bool textures_need_update; + +protected: + void flush_texture_buffers(); }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 509da7a0a84..7bdf81462b8 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -29,6 +29,15 @@ CCL_NAMESPACE_BEGIN +struct texture_slot_t { + texture_slot_t(const string& name, int slot) + : name(name), + slot(slot) { + } + string name; + int slot; +}; + bool OpenCLDeviceBase::opencl_error(cl_int err) { if(err != CL_SUCCESS) { @@ -63,7 +72,7 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where) } OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_) -: Device(info, stats, background_) +: Device(info, stats, background_), memory_manager(this) { cpPlatform = NULL; cdDevice = NULL; @@ -71,6 +80,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cqCommandQueue = NULL; null_mem = 0; device_initialized = false; + textures_need_update = true; vector<OpenCLPlatformDevice> usable_devices; OpenCLInfo::get_usable_devices(&usable_devices); @@ -126,6 +136,12 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou return; } + /* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */ + texture_descriptors.resize(1); + texture_descriptors_buffer.resize(1); + 
texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0]; + memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + fprintf(stderr, "Device init success\n"); device_initialized = true; } @@ -134,6 +150,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase() { task_pool.stop(); + memory_manager.free(); + if(null_mem) clReleaseMemObject(CL_MEM_PTR(null_mem)); @@ -493,29 +511,35 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) void OpenCLDeviceBase::tex_alloc(const char *name, device_memory& mem, - InterpolationType /*interpolation*/, - ExtensionType /*extension*/) + InterpolationType interpolation, + ExtensionType extension) { VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - mem_alloc(NULL, mem, MEM_READ_ONLY); - mem_copy_to(mem); - assert(mem_map.find(name) == mem_map.end()); - mem_map.insert(MemMap::value_type(name, mem.device_pointer)); + + memory_manager.alloc(name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. 
*/ + mem.device_pointer = 1; + textures[name] = Texture(&mem, interpolation, extension); + textures_need_update = true; } void OpenCLDeviceBase::tex_free(device_memory& mem) { if(mem.device_pointer) { - foreach(const MemMap::value_type& value, mem_map) { - if(value.second == mem.device_pointer) { - mem_map.erase(value.first); + mem.device_pointer = 0; + + if(memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach(TexturesMap::value_type& value, textures) { + if(value.second.mem == &mem) { + textures.erase(value.first); break; } } - - mem_free(mem); } } @@ -581,6 +605,98 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); } +void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + flush_texture_buffers(); + + memory_manager.set_kernel_arg_buffers(kernel, narg); +} + +void OpenCLDeviceBase::flush_texture_buffers() +{ + if(!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. */ + int num_slots = 0; + + vector<texture_slot_t> texture_slots; + +#define KERNEL_TEX(type, ttype, name) \ + if(textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +#include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach(TexturesMap::value_type& tex, textures) { + string name = tex.first; + + if(string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, + num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. 
*/ + memory_manager.free(texture_descriptors_buffer); + + texture_descriptors.resize(num_slots); + texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t)); + texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0]; + + memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); + + /* Fill in descriptors */ + foreach(texture_slot_t& slot, texture_slots) { + Texture& tex = textures[slot.name]; + + tex_info_t& info = texture_descriptors[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + + info.offset = desc.offset; + info.buffer = desc.device_buffer; + + if(string_startswith(slot.name, "__tex_image")) { + info.width = tex.mem->data_width; + info.height = tex.mem->data_height; + info.depth = tex.mem->data_depth; + + info.options = 0; + + if(tex.interpolation == INTERPOLATION_CLOSEST) { + info.options |= (1 << 0); + } + + switch(tex.extension) { + case EXTENSION_REPEAT: + info.options |= (1 << 1); + break; + case EXTENSION_EXTEND: + info.options |= (1 << 2); + break; + case EXTENSION_CLIP: + info.options |= (1 << 3); + break; + default: + break; + } + } + } + + /* Force write of descriptors. 
*/ + memory_manager.free(texture_descriptors_buffer); + memory_manager.alloc("texture_descriptors", texture_descriptors_buffer); +} + void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) { /* cast arguments to cl types */ @@ -605,10 +721,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ d_rgba, d_buffer); -#define KERNEL_TEX(type, ttype, name) \ -set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); start_arg_index += kernel_set_args(ckFilmConvertKernel, start_arg_index, @@ -1030,10 +1143,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) d_output_luma); } -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(kernel, &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(kernel, &start_arg_index); start_arg_index += kernel_set_args(kernel, start_arg_index, diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index 06c15bcf401..ec47fdafa3d 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -82,10 +82,7 @@ public: d_buffer, d_rng_state); -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index); start_arg_index += kernel_set_args(ckPathTraceKernel, start_arg_index, diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 76d9983e9a2..16a96213100 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -99,6 +99,8 @@ public: void thread_run(DeviceTask *task) { + flush_texture_buffers(); + 
if(task->type == DeviceTask::FILM_CONVERT) { film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); } @@ -113,10 +115,19 @@ public: */ typedef struct KernelGlobals { ccl_constant KernelData *data; + ccl_global char *buffers[8]; + + typedef struct _tex_info_t { + uint buffer, padding; + uint64_t offset; + uint width, height, depth, options; + } _tex_info_t; + #define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; + _tex_info_t name; #include "kernel/kernel_textures.h" #undef KERNEL_TEX + SplitData split_data; SplitParams split_param_data; } KernelGlobals; @@ -217,11 +228,7 @@ public: *cached_memory.ray_state, *cached_memory.rng_state); -/* TODO(sergey): Avoid map lookup here. */ -#define KERNEL_TEX(type, ttype, name) \ - device->set_kernel_arg_mem(program(), &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + device->set_kernel_arg_buffers(program(), &start_arg_index); start_arg_index += device->kernel_set_args(program(), @@ -352,11 +359,7 @@ public: ray_state, rtile.rng_state); -/* TODO(sergey): Avoid map lookup here. 
*/ -#define KERNEL_TEX(type, ttype, name) \ - device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name); -#include "kernel/kernel_textures.h" -#undef KERNEL_TEX + device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index); start_arg_index += device->kernel_set_args(device->program_data_init(), diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 0d34af3e040..7d5173a5f1d 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -635,7 +635,7 @@ bool OpenCLInfo::device_supported(const string& platform_name, "Tahiti", "Pitcairn", "Capeverde", "Oland", NULL }; - for (int i = 0; blacklist[i] != NULL; i++) { + for(int i = 0; blacklist[i] != NULL; i++) { if(device_name == blacklist[i]) { VLOG(1) << "AMD device " << device_name << " not supported"; return false; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 23e9bd311c4..b4ca16bdb48 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -79,7 +79,6 @@ set(SRC_HEADERS kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h - kernel_debug.h kernel_differential.h kernel_emission.h kernel_film.h @@ -202,6 +201,7 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_curve.h + geom/geom_curve_intersect.h geom/geom_motion_curve.h geom/geom_motion_triangle.h geom/geom_motion_triangle_intersect.h @@ -233,6 +233,7 @@ set(SRC_FILTER_HEADERS set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h + ../util/util_defines.h ../util/util_half.h ../util/util_hash.h ../util/util_math.h diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 85741016b25..cf0c8542d69 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -233,7 +233,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, 
ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, - int skip_object, + uint visibility, uint max_hits, uint *num_hits) { @@ -244,7 +244,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -253,7 +253,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_motion(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -264,7 +264,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_hair(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -275,7 +275,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all_instancing(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } @@ -284,7 +284,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, return bvh_intersect_shadow_all(kg, ray, isect, - skip_object, + visibility, max_hits, num_hits); } diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 74a9ebf14e4..6c33dad5426 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - 
P.x) * idir.x; @@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); dist[0] = c0min; dist[1] = c1min; @@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); if(difl != 0.0f) { float hdiff = 1.0f + difl; @@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - 
const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask = tnear <= tfar; dist[0] = tnear.f[0]; dist[1] = tnear.f[1]; @@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask; if(difl != 0.0f) { const float round_down = 1.0f - difl; @@ -615,7 +615,7 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const float3& P, const float3& dir, const ssef& isect_near, - const ssef& isect_far, + const ssef& isect_far, const ssef& tsplat, const ssef Psplat[3], const ssef idirsplat[3], diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index 267e098f912..a6a4353562c 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -45,7 +45,7 @@ ccl_device_inline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, - const int skip_object, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -119,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idir, isect_t, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, @@ -134,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idirsplat, shufflexyz, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #endif // __KERNEL_SSE2__ @@ -186,17 +186,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* primitive 
intersection */ while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - -#ifdef __SHADOW_TRICKS__ - uint tri_object = (object == OBJECT_NONE) - ? kernel_tex_fetch(__prim_object, prim_addr) - : object; - if(tri_object == skip_object) { - ++prim_addr; - continue; - } -#endif - bool hit; /* todo: specialized intersect functions which don't fill in @@ -209,7 +198,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, isect_array, P, dir, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -221,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -232,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, case PRIMITIVE_MOTION_CURVE: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } @@ -402,7 +391,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, - const int skip_object, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -411,7 +400,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, - skip_object, + visibility, max_hits, num_hits); } @@ 
-422,7 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, - skip_object, + visibility, max_hits, num_hits); } diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index c58d3b0316c..ae8f54821f2 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -244,14 +244,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif #else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; #endif } @@ -274,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } @@ -298,44 +298,44 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - 
object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 6d22f0b0d6a..3036efd4198 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const sseb vmask = cast(tnear) > cast(tfar); int mask = (int)movemask(vmask)^0xf; #else - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; int mask = (int)movemask(vmask); #endif @@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust( const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return (int)movemask(vmask); diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index ce474438f2c..522213f30ca 100644 
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -33,7 +33,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, - const int skip_object, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -107,7 +107,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(false #ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) + || ((__float_as_uint(inodes.x) & visibility) == 0) #endif #if BVH_FEATURE(BVH_MOTION) || UNLIKELY(ray->time < inodes.y) @@ -244,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(node_addr < 0) { float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + if((__float_as_uint(leaf.z) & visibility) == 0) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; @@ -268,17 +268,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Primitive intersection. */ while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - -#ifdef __SHADOW_TRICKS__ - uint tri_object = (object == OBJECT_NONE) - ? 
kernel_tex_fetch(__prim_object, prim_addr) - : object; - if(tri_object == skip_object) { - ++prim_addr; - continue; - } -#endif - bool hit; /* todo: specialized intersect functions which don't fill in @@ -291,7 +280,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, isect_array, P, dir, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -303,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -314,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, case PRIMITIVE_MOTION_CURVE: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index fca75a1d416..335a4afd47a 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -340,7 +340,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. 
*/ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -362,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -379,37 +379,37 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 22d0092093a..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). 
*/ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { - const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); - const float phi = M_2PI_F * randU.y; + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). 
*/ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); @@ -474,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 2eb2457c9e5..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -100,11 +100,14 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( bool outside = true; for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + /* Sample microfacet normal. 
*/ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); #ifdef MF_MULTI_GLASS if(order == 0 && use_fresnel) { @@ -136,7 +139,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( #ifdef MF_MULTI_GLASS bool next_outside; float3 wi_prev = -wr; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; @@ -204,14 +208,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ if(!use_fresnel && order > 0) @@ -221,7 +227,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( #ifdef MF_MULTI_GLASS bool next_outside; float3 wi_prev = -wr; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index f733ea4c517..267aeea6e86 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -348,8 +348,9 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) { Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight); - if(!bssrdf) + if(bssrdf == NULL) { return NULL; + } float sample_weight = fabsf(average(weight)); bssrdf->sample_weight = sample_weight; @@ -399,7 +400,7 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) bssrdf_burley_setup(bssrdf); } - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF; + return SD_BSSRDF; } } diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 3185330994c..3ddd8712266 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. * pixel_buffer always points to the first of the 4 current pixel in the first pass. 
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN #define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - __m128 y4 = _mm_set1_ps(pixel.y); \ + float4 y4 = make_float4(pixel.y); \ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ - __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); #define END_FOR_PIXEL_WINDOW_SSE } \ pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *features, - const __m128 *ccl_restrict mean, + float4 *features, + const float4 *ccl_restrict mean, int pass_stride) { features[0] = x; features[1] = y; - features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[2] = fabs(ccl_get_feature_sse(0)); features[3] = ccl_get_feature_sse(1); features[4] = ccl_get_feature_sse(2); features[5] = ccl_get_feature_sse(3); @@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, features[9] = ccl_get_feature_sse(7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_sub_ps(features[i], mean[i]); + features[i] = features[i] - mean[i]; } for(int i = 0; i < DENOISE_FEATURES; i++) - features[i] = _mm_mask_ps(features[i], active_pixels); + features[i] = mask(active_pixels, features[i]); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, - __m128 active_pixels, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, const float *ccl_restrict buffer, - __m128 *scales, - const __m128 *ccl_restrict 
mean, + float4 *scales, + const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); - scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); - - scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); - - __m128 diff, scale; - diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[3] = _mm_mask_ps(scale, active_pixels); - - scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); - - diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); - scale = _mm_mul_ps(diff, diff); - diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); - scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); - scales[5] = _mm_mask_ps(scale, active_pixels); + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); } -ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) { - scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); - scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), 
_mm_set1_ps(0.01f))); - scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); - scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); - - scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); - scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 3e752bce68f..5e989331bc2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int w, int f) { -#ifdef __KERNEL_SSE3__ - int aligned_lowx = (rect.x & ~(3)); - int aligned_highx = ((rect.z + 3) & ~(3)); -#endif + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); @@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { -#ifdef __KERNEL_SSE3__ - for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + float4* out_image4 = (float4*)(out_image + y*w); + float4* difference_image4 = (float4*)(difference_image + 
y1*w); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; } -#else - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] += difference_image[y1*w+x]; - } -#endif } for(int x = rect.x; x < rect.z; x++) { out_image[y*w+x] *= 1.0f/(high - low); diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index d5ae1b73927..2aeb54a62be 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -61,8 +61,8 @@ ccl_device void kernel_filter_divide_shadow(int sample, varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); } - varA /= (odd_sample - 1); - varB /= (even_sample - 1); + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); sampleVariance[idx] = 0.5f*(varA + varB) / sample; sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); @@ -96,11 +96,17 @@ ccl_device void kernel_filter_get_feature(int sample, int idx = (y-rect.y)*buffer_w + (x - rect.x); mean[idx] = center_buffer[m_offset] / sample; - if(use_split_variance) { - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + if(sample > 1) { + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } } else { - variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + /* Can't compute variance with single sample, just set it very high. 
*/ + variance[idx] = 1e10f; } } @@ -114,49 +120,56 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, { int buffer_w = align_up(rect.z - rect.x, 4); - int n = 0; - float values[25]; - for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { - for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { - int idx = (y1-rect.y)*buffer_w + (x1-rect.x); - float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); - - /* Find the position of L. */ - int i; - for(i = 0; i < n; i++) { - if(values[i] > L) break; - } - /* Make space for L by shifting all following values to the right. */ - for(int j = n; j > i; j--) { - values[j] = values[j-1]; - } - /* Insert L. */ - values[i] = L; - n++; - } - } - int idx = (y-rect.y)*buffer_w + (x-rect.x); - float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); - float ref = 2.0f*values[(int)(n*0.75f)]; float fac = 1.0f; - if(L > ref) { - /* The pixel appears to be an outlier. - * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel - * should actually be at the reference value: - * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. - * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. - */ - float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); - if(L - 3*stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. 
*/ - depth[idx] = -depth[idx]; - fac = ref/L; - variance[idx ] *= fac*fac; - variance[idx + pass_stride] *= fac*fac; - variance[idx+2*pass_stride] *= fac*fac; + if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) { + depth[idx] = -depth[idx]; + fac = 0.0f; + } + else { + float L = average(color); + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + float ref = 2.0f*values[(int)(n*0.75f)]; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. 
*/ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } } } out[idx ] = fac*image[idx]; diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 30dc2969b11..9e65f61664b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff { int buffer_w = align_up(rect.z - rect.x, 4); - __m128 features[DENOISE_FEATURES]; + float4 features[DENOISE_FEATURES]; const float *ccl_restrict pixel_buffer; int2 pixel; @@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff min(rect.w, y + radius + 1)); int num_pixels = (high.y - low.y) * (high.x - low.x); - __m128 feature_means[DENOISE_FEATURES]; + float4 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + float4 pixel_scale = make_float4(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { - feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; } - __m128 feature_scale[DENOISE_FEATURES]; + float4 feature_scale[DENOISE_FEATURES]; math_vector_zero_sse(feature_scale, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); @@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale); - __m128 
feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); FOR_PIXEL_WINDOW_SSE { filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); } END_FOR_PIXEL_WINDOW_SSE float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; @@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff /* Bake the feature scaling into the transformation matrix. */ for(int i = 0; i < DENOISE_FEATURES; i++) { - math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); } } diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index c623e3490fd..f34b77ebc07 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -27,6 +27,7 @@ #include "kernel/geom/geom_motion_triangle_shader.h" #include "kernel/geom/geom_motion_curve.h" #include "kernel/geom/geom_curve.h" +#include "kernel/geom/geom_curve_intersect.h" #include "kernel/geom/geom_volume.h" #include "kernel/geom/geom_primitive.h" diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 5c3b0ee3c15..e35267f02bf 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -16,18 +16,13 @@ CCL_NAMESPACE_BEGIN /* Curve Primitive * - * Curve primitive for rendering hair and fur. These can be render as flat ribbons - * or curves with actual thickness. 
The curve can also be rendered as line segments - * rather than curves for better performance */ + * Curve primitive for rendering hair and fur. These can be render as flat + * ribbons or curves with actual thickness. The curve can also be rendered as + * line segments rather than curves for better performance. + */ #ifdef __HAIR__ -#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) -# define ccl_device_curveintersect ccl_device -#else -# define ccl_device_curveintersect ccl_device_forceinline -#endif - /* Reading attributes on various curve elements */ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) @@ -151,7 +146,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd /* Curve tangent normal */ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) -{ +{ float3 tgN = make_float3(0.0f,0.0f,0.0f); if(sd->type & PRIMITIVE_ALL_CURVE) { @@ -219,893 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, } } -#ifdef __KERNEL_SSE2__ -ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) -{ - return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); -} -#endif - -#ifdef __KERNEL_SSE2__ -/* Pass P and dir by reference to aligned vector */ -ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -#else -ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) -#endif -{ - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); - - if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const 
float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if(time < prim_time.x || time > prim_time.y) { - return false; - } - } - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - float epsilon = 0.0f; - float r_st, r_en; - - int depth = kernel_data.curve.subdivisions; - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); - -#ifdef __KERNEL_SSE2__ - ssef vdir = load4f(dir); - ssef vcurve_coef[4]; - const float3 *curve_coef = (float3 *)vcurve_coef; - - { - ssef dtmp = vdir * vdir; - ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); - ssef rd_ss = load1f_first(1.0f) / d_ss; - - ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); - int2 &v00 = (int2 &)v00vec; - - int k0 = v00.x + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, v00.x); - int kb = min(k1 + 1, v00.x + v00.y - 1); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) - avxf P_curve_0_1, P_curve_2_3; - if(is_curve_primitive) { - P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); - P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); - } -#else /* __KERNEL_AVX2__ */ - ssef P_curve[4]; - - if(is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); - P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); - } -#endif /* __KERNEL_AVX2__ */ - - ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); - ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; - ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; - ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - - ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); - ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) - const avxf vPP = _mm256_broadcast_ps(&P.m128); - const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); - const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); - const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); - - const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), - htfm00, - madd(shuffle<1>(P_curve_0_1 - vPP), - htfm11, - shuffle<2>(P_curve_0_1 - vPP) * htfm22)); - const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), - htfm00, - madd(shuffle<1>(P_curve_2_3 - vPP), - htfm11, - shuffle<2>(P_curve_2_3 - vPP)*htfm22)); - - const ssef p0 = _mm256_castps256_ps128(p01); - const ssef p1 = _mm256_extractf128_ps(p01, 1); - const ssef p2 = _mm256_castps256_ps128(p23); - const ssef p3 = _mm256_extractf128_ps(p23, 1); - - const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); - r_st = ((float4 &)P_curve_1).w; - const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); - r_en = ((float4 &)P_curve_2).w; -#else /* __KERNEL_AVX2__ */ - ssef htfm[] = { htfm0, htfm1, htfm2 }; - ssef vP = load4f(P); - ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); - ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); - ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); - ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); - - r_st = 
((float4 &)P_curve[1]).w; - r_en = ((float4 &)P_curve[2]).w; -#endif /* __KERNEL_AVX2__ */ - - float fc = 0.71f; - ssef vfc = ssef(fc); - ssef vfcxp3 = vfc * p3; - - vcurve_coef[0] = p1; - vcurve_coef[1] = vfc * (p2 - p0); - vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); - - } -#else - float3 curve_coef[4]; - - /* curve Intersection check */ - /* obtain curve parameters */ - { - /* ray transform created - this should be created at beginning of intersection loop */ - Transform htfm; - float d = sqrtf(dir.x * dir.x + dir.z * dir.z); - htfm = make_transform( - dir.z / d, 0, -dir.x /d, 0, - -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, - dir.x, dir.y, dir.z, 0, - 0, 0, 0, 1); - - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + segment; - int k1 = k0 + 1; - - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); - } - - float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); - float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); - float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); - float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); - - float fc = 0.71f; - curve_coef[0] = p1; - curve_coef[1] = -fc*p0 + fc*p2; - curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; - curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; - r_st = P_curve[1].w; - r_en = P_curve[2].w; - } -#endif - - float r_curr = max(r_st, r_en); - - if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) - epsilon = 2 * r_curr; - - /* find bounds - this is slow for cubic curves */ - float upper, lower; - - float zextrem[4]; - curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); - if(lower - r_curr > isect->t || upper + r_curr < epsilon) - return false; - - /* minimum width extension */ - float mw_extension = min(difl * fabsf(upper), extmax); - float r_ext = mw_extension + r_curr; - - float xextrem[4]; - curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); - if(lower > r_ext || upper < -r_ext) - return false; - - float yextrem[4]; - curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); - if(lower > r_ext || upper < -r_ext) - return false; - - /* setup recurrent loop */ - int level = 1 << depth; - int tree = 0; - float resol = 1.0f / (float)level; - bool hit = false; - - /* begin loop */ - while(!(tree >> (depth))) { - const float i_st = tree * resol; - const float i_en 
= i_st + (level * resol); - -#ifdef __KERNEL_SSE2__ - ssef vi_st = ssef(i_st), vi_en = ssef(i_en); - ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - - ssef vbmin = min(vp_st, vp_en); - ssef vbmax = max(vp_st, vp_en); - - float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; - float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; - float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; - float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; -#else - float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; - float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; - - float bminx = min(p_st.x, p_en.x); - float bmaxx = max(p_st.x, p_en.x); - float bminy = min(p_st.y, p_en.y); - float bmaxy = max(p_st.y, p_en.y); - float bminz = min(p_st.z, p_en.z); - float bmaxz = max(p_st.z, p_en.z); -#endif - - if(xextrem[0] >= i_st && xextrem[0] <= i_en) { - bminx = min(bminx,xextrem[1]); - bmaxx = max(bmaxx,xextrem[1]); - } - if(xextrem[2] >= i_st && xextrem[2] <= i_en) { - bminx = min(bminx,xextrem[3]); - bmaxx = max(bmaxx,xextrem[3]); - } - if(yextrem[0] >= i_st && yextrem[0] <= i_en) { - bminy = min(bminy,yextrem[1]); - bmaxy = max(bmaxy,yextrem[1]); - } - if(yextrem[2] >= i_st && yextrem[2] <= i_en) { - bminy = min(bminy,yextrem[3]); - bmaxy = max(bmaxy,yextrem[3]); - } - if(zextrem[0] >= i_st && zextrem[0] <= i_en) { - bminz = min(bminz,zextrem[1]); - bmaxz = max(bmaxz,zextrem[1]); - } - if(zextrem[2] >= i_st && zextrem[2] <= i_en) { - bminz = min(bminz,zextrem[3]); - bmaxz = max(bmaxz,zextrem[3]); - } - - float r1 = r_st + (r_en - r_st) * i_st; - float r2 = r_st + (r_en - r_st) * i_en; - r_curr = max(r1, r2); - - mw_extension = min(difl * fabsf(bmaxz), extmax); - float r_ext = mw_extension + 
r_curr; - float coverage = 1.0f; - - if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { - /* the bounding box does not overlap the square centered at O */ - tree += level; - level = tree & -tree; - } - else if(level == 1) { - - /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ - float t = isect->t; - float u = 0.0f; - float gd = 0.0f; - - if(flags & CURVE_KN_RIBBONS) { - float3 tg = (p_en - p_st); -#ifdef __KERNEL_SSE__ - const float3 tg_sq = tg * tg; - float w = tg_sq.x + tg_sq.y; -#else - float w = tg.x * tg.x + tg.y * tg.y; -#endif - if(w == 0) { - tree++; - level = tree & -tree; - continue; - } -#ifdef __KERNEL_SSE__ - const float3 p_sttg = p_st * tg; - w = -(p_sttg.x + p_sttg.y) / w; -#else - w = -(p_st.x * tg.x + p_st.y * tg.y) / w; -#endif - w = saturate(w); - - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r_st + (r_en - r_st) * u; - /* compare x-y distances */ - float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { - tree++; - level = tree & -tree; - continue; - } - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { - tree++; - level = tree & -tree; - continue; - } - - /* compute coverage */ - float r_ext = r_curr; - coverage = 1.0f; - if(difl != 0.0f) { - mw_extension = min(difl * fabsf(bmaxz), extmax); - r_ext = mw_extension + r_curr; -#ifdef __KERNEL_SSE__ - const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); - float d = dxxx.x; -#else - float d = sqrtf(p_curr.x * 
p_curr.x + p_curr.y * p_curr.y); -#endif - float d0 = d - r_curr; - float d1 = d + r_curr; - float inv_mw_extension = 1.0f/mw_extension; - if(d0 >= 0) - coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; - else // inside - coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; - } - - if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { - tree++; - level = tree & -tree; - continue; - } - - t = p_curr.z; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - else { - float l = len(p_en - p_st); - /* minimum width extension */ - float or1 = r1; - float or2 = r2; - - if(difl != 0.0f) { - mw_extension = min(len(p_st - P) * difl, extmax); - or1 = r1 < mw_extension ? mw_extension : r1; - mw_extension = min(len(p_en - P) * difl, extmax); - or2 = r2 < mw_extension ? 
mw_extension : r2; - } - /* --- */ - float invl = 1.0f/l; - float3 tg = (p_en - p_st) * invl; - gd = (or2 - or1) * invl; - float difz = -dot(p_st,tg); - float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); - float invcyla = 1.0f/cyla; - float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); - float tcentre = -halfb*invcyla; - float zcentre = difz + (tg.z * tcentre); - float3 tdif = - p_st; - tdif.z += tcentre; - float tdifz = dot(tdif,tg); - float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); - float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; - float td = tb*tb - 4*cyla*tc; - if(td < 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float rootd = sqrtf(td); - float correction = (-tb - rootd) * 0.5f * invcyla; - t = tcentre + correction; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - - if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { - correction = (-tb + rootd) * 0.5f * invcyla; - t = tcentre + correction; - } - - if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float w = (zcentre + (tg.z * correction)) * invl; - w = saturate(w); - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - r_curr = r1 + (r2 - r1) * w; - r_ext = or1 + (or2 - or1) * w; - coverage = r_curr/r_ext; - - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - /* we found a new intersection */ - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. 
we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = u; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - hit = true; - } - - tree++; - level = tree & -tree; - } - else { - /* split the curve into two curves and process */ - level = level >> 1; - } - } - - return hit; -} - -ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -{ - /* define few macros to minimize code duplication for SSE */ -#ifndef __KERNEL_SSE2__ -# define len3_squared(x) len_squared(x) -# define len3(x) len(x) -# define dot3(x, y) dot(x, y) -#endif - - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); - - if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if(time < prim_time.x || time > prim_time.y) { - return false; - } - } - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - /* curve Intersection check */ - int flags = kernel_data.curve.curveflags; - - int prim = kernel_tex_fetch(__prim_index, curveAddr); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int cnum = __float_as_int(v00.x); - int k0 = cnum + segment; - int k1 = k0 + 1; - -#ifndef __KERNEL_SSE2__ - float4 P_curve[2]; - - if(is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, k0); - P_curve[1] = kernel_tex_fetch(__curve_keys, k1); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); - } - - float or1 = P_curve[0].w; - float or2 = P_curve[1].w; - float3 p1 = float4_to_float3(P_curve[0]); - float3 p2 = float4_to_float3(P_curve[1]); - - /* minimum width extension */ - float r1 = or1; - float r2 = or2; - float3 dif = P - p1; - float3 dif_second = P - p2; - if(difl != 0.0f) { - float pixelsize = min(len3(dif) * difl, extmax); - r1 = or1 < pixelsize ? pixelsize : or1; - pixelsize = min(len3(dif_second) * difl, extmax); - r2 = or2 < pixelsize ? pixelsize : or2; - } - /* --- */ - - float3 p21_diff = p2 - p1; - float3 sphere_dif1 = (dif + dif_second) * 0.5f; - float3 dir = direction; - float sphere_b_tmp = dot3(dir, sphere_dif1); - float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; -#else - ssef P_curve[2]; - - if(is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); - } - - const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - - ssef r12 = or12; - const ssef vP = load4f(P); - const ssef dif = vP - P_curve[0]; - const ssef dif_second = vP - P_curve[1]; - if(difl != 0.0f) { - const ssef len1_sq = len3_squared_splat(dif); - const ssef len2_sq = len3_squared_splat(dif_second); - const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); - r12 = max(or12, pixelsize12); - } - float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); - float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); - - const ssef p21_diff = P_curve[1] - P_curve[0]; - const ssef sphere_dif1 = (dif + dif_second) * 0.5f; - const ssef dir = load4f(direction); - const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); -#endif - - float mr = max(r1, r2); - float l = len3(p21_diff); - float invl = 1.0f / l; - float sp_r = mr + 0.5f * l; - - float sphere_b = dot3(dir, sphere_dif2); - float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; - - if(sdisc < 0.0f) - return false; - - /* obtain parameters and test midpoint distance for suitable modes */ -#ifndef __KERNEL_SSE2__ - float3 tg = p21_diff * invl; -#else - const ssef tg = p21_diff * invl; -#endif - float gd = (r2 - r1) * invl; - - float dirz = dot3(dir, tg); - float difz = dot3(dif, tg); - - float a = 1.0f - (dirz*dirz*(1 + gd*gd)); - - float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); - - float tcentre = -halfb/a; - float zcentre = difz + (dirz * tcentre); - - if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return false; - if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) - return false; - - /* test minimum separation */ -#ifndef 
__KERNEL_SSE2__ - float3 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross(tg, dif)); -#else - const ssef cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross_zxy(tg, dif)); -#endif - float cprodsq = len3_squared(cprod); - float distscaled = dot3(cprod, dif); - - if(cprodsq == 0) - distscaled = cprod2sq; - else - distscaled = (distscaled*distscaled)/cprodsq; - - if(distscaled > mr*mr) - return false; - - /* calculate true intersection */ -#ifndef __KERNEL_SSE2__ - float3 tdif = dif + tcentre * dir; -#else - const ssef tdif = madd(ssef(tcentre), dir, dif); -#endif - float tdifz = dot3(tdif, tg); - float tdifma = tdifz*gd + r1; - float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); - float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; - float td = tb*tb - 4*a*tc; - - if(td < 0.0f) - return false; - - float rootd = 0.0f; - float correction = 0.0f; - if(flags & CURVE_KN_ACCURATE) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - } - - float t = tcentre + correction; - - if(t < isect->t) { - - if(flags & CURVE_KN_INTERSECTCORRECTION) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - t = tcentre + correction; - } - - float z = zcentre + (dirz * correction); - // bool backface = false; - - if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { - // backface = true; - correction = ((-tb + rootd)/(2*a)); - t = tcentre + correction; - z = zcentre + (dirz * correction); - } - - /* stochastic fade from minimum width */ - float adjradius = or1 + z * (or2 - or1) * invl; - adjradius = adjradius / (r1 + z * gd); - if(lcg_state && adjradius != 1.0f) { - if(lcg_step_float(lcg_state) > adjradius) - return false; - } - /* --- */ - - if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { - - if(flags & CURVE_KN_ENCLOSEFILTER) { - float enc_ratio = 1.01f; - if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { - float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); - float c2 = dot3(dif, dif) 
- difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; - if(a2*c2 < 0.0f) - return false; - } - } - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = z*invl; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - - return true; - } - } - } - - return false; - -#ifndef __KERNEL_SSE2__ -# undef len3_squared -# undef len3 -# undef dot3 -#endif -} - -ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float fc = 0.71f; - float data[4]; - float t2 = t * t; - data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; - data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; - data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; - data[3] = 3.0f * fc * t2 - 2.0f * fc * t; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float data[4]; - float fc = 0.71f; - float t2 = t * t; - float t3 = t2 * t; - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - int flag = kernel_data.curve.curveflags; - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_itfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -#endif - - P = 
transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - int prim = kernel_tex_fetch(__prim_index, isect->prim); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); - int k1 = k0 + 1; - - float3 tg; - - if(flag & CURVE_KN_INTERPOLATE) { - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(sd->type & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); - } - - float3 p[4]; - p[0] = float4_to_float3(P_curve[0]); - p[1] = float4_to_float3(P_curve[1]); - p[2] = float4_to_float3(P_curve[2]); - p[3] = float4_to_float3(P_curve[3]); - - P = P + D*t; - -#ifdef __UV__ - sd->u = isect->u; - sd->v = 0.0f; -#endif - - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); - - if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - sd->Ng = normalize(-(D - tg * (dot(tg, D)))); - } - else { - /* direction from inside to surface of curve */ - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - sd->Ng = normalize(P - p_curr); - - /* adjustment for changing radius */ - float gd = isect->v; - - if(gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); - } - } - - /* todo: sometimes the normal is still so that this is detected as - * backfacing even if cull backfaces is enabled */ - - sd->N = sd->Ng; - } - else { - float4 P_curve[2]; - - if(sd->type & PRIMITIVE_CURVE) { - P_curve[0]= kernel_tex_fetch(__curve_keys, k0); - P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - } - else { - motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); - } - - float l = 
1.0f; - tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); - - P = P + D*t; - - float3 dif = P - float4_to_float3(P_curve[0]); - -#ifdef __UV__ - sd->u = dot(dif,tg)/l; - sd->v = 0.0f; -#endif - - if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - sd->Ng = -(D - tg * dot(tg, D)); - sd->Ng = normalize(sd->Ng); - } - else { - float gd = isect->v; - - /* direction from inside to surface of curve */ - sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); - - /* adjustment for changing radius */ - if(gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); - } - } - - sd->N = sd->Ng; - } - -#ifdef __DPDU__ - /* dPdu/dPdv */ - sd->dPdu = tg; - sd->dPdv = cross(tg, sd->Ng); -#endif - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = sd->ob_tfm; -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -} - -#endif +#endif /* __HAIR__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h new file mode 100644 index 00000000000..e9a149ea1ab --- /dev/null +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -0,0 +1,934 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Curve primitive intersection functions. 
*/ + +#ifdef __HAIR__ + +#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) +# define ccl_device_curveintersect ccl_device +#else +# define ccl_device_curveintersect ccl_device_forceinline +#endif + +#ifdef __KERNEL_SSE2__ +ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) +{ + return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); +} +#endif + +/* On CPU pass P and dir by reference to aligned vector. */ +ccl_device_curveintersect bool cardinal_curve_intersect( + KernelGlobals *kg, + Intersection *isect, + const float3 ccl_ref P, + const float3 ccl_ref dir, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + float epsilon = 0.0f; + float r_st, r_en; + + int depth = kernel_data.curve.subdivisions; + int flags = kernel_data.curve.curveflags; + int prim = kernel_tex_fetch(__prim_index, curveAddr); + +#ifdef __KERNEL_SSE2__ + ssef vdir = load4f(dir); + ssef vcurve_coef[4]; + const float3 *curve_coef = (float3 *)vcurve_coef; + + { + ssef dtmp = vdir * vdir; + ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); + ssef rd_ss = load1f_first(1.0f) / d_ss; + + ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); + int2 &v00 = (int2 &)v00vec; + + int k0 = v00.x + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, v00.x); + int kb = min(k1 + 1, v00.x + v00.y - 1); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + avxf P_curve_0_1, P_curve_2_3; + if(is_curve_primitive) { + P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); + P_curve_2_3 = 
_mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; + motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); + } +#else /* __KERNEL_AVX2__ */ + ssef P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); + P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); + } + else { + int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); + } +#endif /* __KERNEL_AVX2__ */ + + ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); + ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; + ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; + ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); + + ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); + ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + const avxf vPP = _mm256_broadcast_ps(&P.m128); + const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); + const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); + const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); + + const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), + htfm00, + madd(shuffle<1>(P_curve_0_1 - vPP), + htfm11, + shuffle<2>(P_curve_0_1 - vPP) * htfm22)); + const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), + htfm00, + madd(shuffle<1>(P_curve_2_3 - vPP), + htfm11, + shuffle<2>(P_curve_2_3 - vPP)*htfm22)); + + const ssef p0 = _mm256_castps256_ps128(p01); + const ssef p1 = 
_mm256_extractf128_ps(p01, 1); + const ssef p2 = _mm256_castps256_ps128(p23); + const ssef p3 = _mm256_extractf128_ps(p23, 1); + + const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); + r_st = ((float4 &)P_curve_1).w; + const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); + r_en = ((float4 &)P_curve_2).w; +#else /* __KERNEL_AVX2__ */ + ssef htfm[] = { htfm0, htfm1, htfm2 }; + ssef vP = load4f(P); + ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); + ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); + ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); + ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); + + r_st = ((float4 &)P_curve[1]).w; + r_en = ((float4 &)P_curve[2]).w; +#endif /* __KERNEL_AVX2__ */ + + float fc = 0.71f; + ssef vfc = ssef(fc); + ssef vfcxp3 = vfc * p3; + + vcurve_coef[0] = p1; + vcurve_coef[1] = vfc * (p2 - p0); + vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); + + } +#else + float3 curve_coef[4]; + + /* curve Intersection check */ + /* obtain curve parameters */ + { + /* ray transform created - this should be created at beginning of intersection loop */ + Transform htfm; + float d = sqrtf(dir.x * dir.x + dir.z * dir.z); + htfm = make_transform( + dir.z / d, 0, -dir.x /d, 0, + -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, + dir.x, dir.y, dir.z, 0, + 0, 0, 0, 1); + + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + segment; + int k1 = k0 + 1; + + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + int fobject = (object == 
OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); + } + + float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); + float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); + float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); + float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); + + float fc = 0.71f; + curve_coef[0] = p1; + curve_coef[1] = -fc*p0 + fc*p2; + curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; + curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; + r_st = P_curve[1].w; + r_en = P_curve[2].w; + } +#endif + + float r_curr = max(r_st, r_en); + + if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) + epsilon = 2 * r_curr; + + /* find bounds - this is slow for cubic curves */ + float upper, lower; + + float zextrem[4]; + curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); + if(lower - r_curr > isect->t || upper + r_curr < epsilon) + return false; + + /* minimum width extension */ + float mw_extension = min(difl * fabsf(upper), extmax); + float r_ext = mw_extension + r_curr; + + float xextrem[4]; + curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); + if(lower > r_ext || upper < -r_ext) + return false; + + float yextrem[4]; + curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); + if(lower > r_ext || upper < -r_ext) + return false; + + /* setup recurrent loop */ + int level = 1 << depth; + int tree = 0; + float resol = 1.0f / (float)level; + bool hit = false; + + /* begin loop */ + while(!(tree >> (depth))) { + const float i_st = tree * resol; + 
const float i_en = i_st + (level * resol); + +#ifdef __KERNEL_SSE2__ + ssef vi_st = ssef(i_st), vi_en = ssef(i_en); + ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); + + ssef vbmin = min(vp_st, vp_en); + ssef vbmax = max(vp_st, vp_en); + + float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; + float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; + float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; + float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; +#else + float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; + float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; + + float bminx = min(p_st.x, p_en.x); + float bmaxx = max(p_st.x, p_en.x); + float bminy = min(p_st.y, p_en.y); + float bmaxy = max(p_st.y, p_en.y); + float bminz = min(p_st.z, p_en.z); + float bmaxz = max(p_st.z, p_en.z); +#endif + + if(xextrem[0] >= i_st && xextrem[0] <= i_en) { + bminx = min(bminx,xextrem[1]); + bmaxx = max(bmaxx,xextrem[1]); + } + if(xextrem[2] >= i_st && xextrem[2] <= i_en) { + bminx = min(bminx,xextrem[3]); + bmaxx = max(bmaxx,xextrem[3]); + } + if(yextrem[0] >= i_st && yextrem[0] <= i_en) { + bminy = min(bminy,yextrem[1]); + bmaxy = max(bmaxy,yextrem[1]); + } + if(yextrem[2] >= i_st && yextrem[2] <= i_en) { + bminy = min(bminy,yextrem[3]); + bmaxy = max(bmaxy,yextrem[3]); + } + if(zextrem[0] >= i_st && zextrem[0] <= i_en) { + bminz = min(bminz,zextrem[1]); + bmaxz = max(bmaxz,zextrem[1]); + } + if(zextrem[2] >= i_st && zextrem[2] <= i_en) { + bminz = min(bminz,zextrem[3]); + bmaxz = max(bmaxz,zextrem[3]); + } + + float r1 = r_st + (r_en - r_st) * i_st; + float r2 = r_st + (r_en - r_st) * i_en; + r_curr = max(r1, r2); + + mw_extension = min(difl * fabsf(bmaxz), extmax); + float r_ext 
= mw_extension + r_curr; + float coverage = 1.0f; + + if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { + /* the bounding box does not overlap the square centered at O */ + tree += level; + level = tree & -tree; + } + else if(level == 1) { + + /* the maximum recursion depth is reached. + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ + float t = isect->t; + float u = 0.0f; + float gd = 0.0f; + + if(flags & CURVE_KN_RIBBONS) { + float3 tg = (p_en - p_st); +#ifdef __KERNEL_SSE__ + const float3 tg_sq = tg * tg; + float w = tg_sq.x + tg_sq.y; +#else + float w = tg.x * tg.x + tg.y * tg.y; +#endif + if(w == 0) { + tree++; + level = tree & -tree; + continue; + } +#ifdef __KERNEL_SSE__ + const float3 p_sttg = p_st * tg; + w = -(p_sttg.x + p_sttg.y) / w; +#else + w = -(p_st.x * tg.x + p_st.y * tg.y) / w; +#endif + w = saturate(w); + + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + r_curr = r_st + (r_en - r_st) * u; + /* compare x-y distances */ + float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { + tree++; + level = tree & -tree; + continue; + } + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { + tree++; + level = tree & -tree; + continue; + } + + /* compute coverage */ + float r_ext = r_curr; + coverage = 1.0f; + if(difl != 0.0f) { + mw_extension = min(difl * fabsf(bmaxz), extmax); + r_ext = mw_extension + r_curr; +#ifdef __KERNEL_SSE__ + const float3 p_curr_sq = p_curr * p_curr; + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); + float d = dxxx.x; +#else + float d = 
sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); +#endif + float d0 = d - r_curr; + float d1 = d + r_curr; + float inv_mw_extension = 1.0f/mw_extension; + if(d0 >= 0) + coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; + else // inside + coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; + } + + if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { + tree++; + level = tree & -tree; + continue; + } + + t = p_curr.z; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + else { + float l = len(p_en - p_st); + /* minimum width extension */ + float or1 = r1; + float or2 = r2; + + if(difl != 0.0f) { + mw_extension = min(len(p_st - P) * difl, extmax); + or1 = r1 < mw_extension ? mw_extension : r1; + mw_extension = min(len(p_en - P) * difl, extmax); + or2 = r2 < mw_extension ? 
mw_extension : r2; + } + /* --- */ + float invl = 1.0f/l; + float3 tg = (p_en - p_st) * invl; + gd = (or2 - or1) * invl; + float difz = -dot(p_st,tg); + float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); + float invcyla = 1.0f/cyla; + float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); + float tcentre = -halfb*invcyla; + float zcentre = difz + (tg.z * tcentre); + float3 tdif = - p_st; + tdif.z += tcentre; + float tdifz = dot(tdif,tg); + float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); + float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; + float td = tb*tb - 4*cyla*tc; + if(td < 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float rootd = sqrtf(td); + float correction = (-tb - rootd) * 0.5f * invcyla; + t = tcentre + correction; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + + if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { + correction = (-tb + rootd) * 0.5f * invcyla; + t = tcentre + correction; + } + + if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float w = (zcentre + (tg.z * correction)) * invl; + w = saturate(w); + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + r_curr = r1 + (r2 - r1) * w; + r_ext = or1 + (or2 - or1) * w; + coverage = r_curr/r_ext; + + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + /* we found a new intersection */ + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. 
we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + hit = true; + } + + tree++; + level = tree & -tree; + } + else { + /* split the curve into two curves and process */ + level = level >> 1; + } + } + + return hit; +} + +ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 direction, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + /* define few macros to minimize code duplication for SSE */ +#ifndef __KERNEL_SSE2__ +# define len3_squared(x) len_squared(x) +# define len3(x) len(x) +# define dot3(x, y) dot(x, y) +#endif + + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + /* curve Intersection check */ + int flags = kernel_data.curve.curveflags; + + int prim = kernel_tex_fetch(__prim_index, curveAddr); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int cnum = __float_as_int(v00.x); + int k0 = cnum + segment; + int k1 = k0 + 1; + +#ifndef __KERNEL_SSE2__ + float4 P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, k0); + P_curve[1] = kernel_tex_fetch(__curve_keys, k1); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); + } + + float or1 = P_curve[0].w; + float or2 = P_curve[1].w; + float3 p1 = float4_to_float3(P_curve[0]); + float3 p2 = float4_to_float3(P_curve[1]); + + /* minimum width extension */ + float r1 = or1; + float r2 = or2; + float3 dif = P - p1; + float3 dif_second = P - p2; + if(difl != 0.0f) { + float pixelsize = min(len3(dif) * difl, extmax); + r1 = or1 < pixelsize ? pixelsize : or1; + pixelsize = min(len3(dif_second) * difl, extmax); + r2 = or2 < pixelsize ? pixelsize : or2; + } + /* --- */ + + float3 p21_diff = p2 - p1; + float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float3 dir = direction; + float sphere_b_tmp = dot3(dir, sphere_dif1); + float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; +#else + ssef P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE)? 
kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); + } + + const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); + + ssef r12 = or12; + const ssef vP = load4f(P); + const ssef dif = vP - P_curve[0]; + const ssef dif_second = vP - P_curve[1]; + if(difl != 0.0f) { + const ssef len1_sq = len3_squared_splat(dif); + const ssef len2_sq = len3_squared_splat(dif_second); + const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); + r12 = max(or12, pixelsize12); + } + float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); + float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); + + const ssef p21_diff = P_curve[1] - P_curve[0]; + const ssef sphere_dif1 = (dif + dif_second) * 0.5f; + const ssef dir = load4f(direction); + const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); +#endif + + float mr = max(r1, r2); + float l = len3(p21_diff); + float invl = 1.0f / l; + float sp_r = mr + 0.5f * l; + + float sphere_b = dot3(dir, sphere_dif2); + float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; + + if(sdisc < 0.0f) + return false; + + /* obtain parameters and test midpoint distance for suitable modes */ +#ifndef __KERNEL_SSE2__ + float3 tg = p21_diff * invl; +#else + const ssef tg = p21_diff * invl; +#endif + float gd = (r2 - r1) * invl; + + float dirz = dot3(dir, tg); + float difz = dot3(dif, tg); + + float a = 1.0f - (dirz*dirz*(1 + gd*gd)); + + float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); + + float tcentre = -halfb/a; + float zcentre = difz + (dirz * tcentre); + + if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) + return false; + if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) + return false; + + /* test minimum separation */ +#ifndef 
__KERNEL_SSE2__ + float3 cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross(tg, dif)); +#else + const ssef cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross_zxy(tg, dif)); +#endif + float cprodsq = len3_squared(cprod); + float distscaled = dot3(cprod, dif); + + if(cprodsq == 0) + distscaled = cprod2sq; + else + distscaled = (distscaled*distscaled)/cprodsq; + + if(distscaled > mr*mr) + return false; + + /* calculate true intersection */ +#ifndef __KERNEL_SSE2__ + float3 tdif = dif + tcentre * dir; +#else + const ssef tdif = madd(ssef(tcentre), dir, dif); +#endif + float tdifz = dot3(tdif, tg); + float tdifma = tdifz*gd + r1; + float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); + float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; + float td = tb*tb - 4*a*tc; + + if(td < 0.0f) + return false; + + float rootd = 0.0f; + float correction = 0.0f; + if(flags & CURVE_KN_ACCURATE) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + } + + float t = tcentre + correction; + + if(t < isect->t) { + + if(flags & CURVE_KN_INTERSECTCORRECTION) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + t = tcentre + correction; + } + + float z = zcentre + (dirz * correction); + // bool backface = false; + + if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { + // backface = true; + correction = ((-tb + rootd)/(2*a)); + t = tcentre + correction; + z = zcentre + (dirz * correction); + } + + /* stochastic fade from minimum width */ + float adjradius = or1 + z * (or2 - or1) * invl; + adjradius = adjradius / (r1 + z * gd); + if(lcg_state && adjradius != 1.0f) { + if(lcg_step_float(lcg_state) > adjradius) + return false; + } + /* --- */ + + if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { + + if(flags & CURVE_KN_ENCLOSEFILTER) { + float enc_ratio = 1.01f; + if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { + float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); + float c2 = dot3(dif, dif) 
- difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; + if(a2*c2 < 0.0f) + return false; + } + } + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = z*invl; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + + return true; + } + } + } + + return false; + +#ifndef __KERNEL_SSE2__ +# undef len3_squared +# undef len3 +# undef dot3 +#endif +} + +ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float fc = 0.71f; + float data[4]; + float t2 = t * t; + data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; + data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; + data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; + data[3] = 3.0f * fc * t2 - 2.0f * fc * t; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float data[4]; + float fc = 0.71f; + float t2 = t * t; + float t3 = t2 * t; + data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; + data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; + data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; + data[3] = fc * t3 - fc * t2; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curve_refine(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) +{ + int flag = kernel_data.curve.curveflags; + float t = isect->t; + float3 P = ray->P; + float3 D = ray->D; + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = 
transform_point(&tfm, P); + D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + int prim = kernel_tex_fetch(__prim_index, isect->prim); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + int k1 = k0 + 1; + + float3 tg; + + if(flag & CURVE_KN_INTERPOLATE) { + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); + } + + float3 p[4]; + p[0] = float4_to_float3(P_curve[0]); + p[1] = float4_to_float3(P_curve[1]); + p[2] = float4_to_float3(P_curve[2]); + p[3] = float4_to_float3(P_curve[3]); + + P = P + D*t; + +#ifdef __UV__ + sd->u = isect->u; + sd->v = 0.0f; +#endif + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); + } + else { + /* direction from inside to surface of curve */ + float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); + sd->Ng = normalize(P - p_curr); + + /* adjustment for changing radius */ + float gd = isect->v; + + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + /* todo: sometimes the normal is still so that this is detected as + * backfacing even if cull backfaces is enabled */ + + sd->N = sd->Ng; + } + else { + float4 P_curve[2]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0]= kernel_tex_fetch(__curve_keys, k0); + P_curve[1]= kernel_tex_fetch(__curve_keys, k1); + } + else { + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + } + + float l = 
1.0f; + tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); + + P = P + D*t; + + float3 dif = P - float4_to_float3(P_curve[0]); + +#ifdef __UV__ + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; +#endif + + if(flag & CURVE_KN_TRUETANGENTGNORMAL) { + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); + } + else { + float gd = isect->v; + + /* direction from inside to surface of curve */ + sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); + + /* adjustment for changing radius */ + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + sd->N = sd->Ng; + } + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); +#endif + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +} + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 6ecdfe0173a..1ffc143be34 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 return rcp(dir); -#else - return 1.0f / dir; -#endif } /* Transform ray into object space to enter static object in BVH */ diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 1e0ef5201c9..698cd6b03fd 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -35,10 +35,10 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) float4 r; switch(id) { case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break; - case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break; - case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break; - case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break; - case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break; + case 8: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_008, x, y, z); break; + case 16: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_016, x, y, z); break; + case 24: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_024, x, y, z); break; + case 32: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_032, x, y, z); break; } return r; } diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 175bd6b9737..ae5f6e5e070 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN * BSDF evaluation result, split per BSDF type. This is used to accumulate * render passes separately. 
*/ +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, + const ShaderData *sd); + ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass) { #ifdef __PASSES__ @@ -178,7 +181,6 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) if(use_light_pass) { L->indirect = make_float3(0.0f, 0.0f, 0.0f); - L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = make_float3(0.0f, 0.0f, 0.0f); L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f); @@ -199,57 +201,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); - L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); - L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); - L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); - L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); - + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); L->background = make_float3(0.0f, 0.0f, 0.0f); L->ao = make_float3(0.0f, 0.0f, 0.0f); L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f); L->mist = 0.0f; + + L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f); + L->state.glossy = make_float3(0.0f, 0.0f, 0.0f); + L->state.transmission = make_float3(0.0f, 0.0f, 0.0f); + L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->state.scatter = make_float3(0.0f, 0.0f, 0.0f); + L->state.direct = make_float3(0.0f, 0.0f, 0.0f); } else #endif { + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); } #ifdef __SHADOW_TRICKS__ L->path_total = make_float3(0.0f, 0.0f, 0.0f); L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); - L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_throughput = 0.0f; + L->shadow_transparency = 1.0f; + L->has_shadow_catcher = 0; #endif #ifdef 
__DENOISING_FEATURES__ L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); L->denoising_depth = 0.0f; -#endif /* __DENOISING_FEATURES__ */ +#endif + +#ifdef __KERNEL_DEBUG__ + L->debug_data.num_bvh_traversed_nodes = 0; + L->debug_data.num_bvh_traversed_instances = 0; + L->debug_data.num_bvh_intersections = 0; + L->debug_data.num_ray_bounces = 0; +#endif } -ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, - BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label) +ccl_device_inline void path_radiance_bsdf_bounce( + KernelGlobals *kg, + PathRadianceState *L_state, + ccl_addr_space float3 *throughput, + BsdfEval *bsdf_eval, + float bsdf_pdf, int bounce, int bsdf_label) { float inverse_pdf = 1.0f/bsdf_pdf; #ifdef __PASSES__ - if(L->use_light_pass) { + if(kernel_data.film.use_light_pass) { if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) { /* first on directly visible surface */ float3 value = *throughput*inverse_pdf; - L->path_diffuse = bsdf_eval->diffuse*value; - L->path_glossy = bsdf_eval->glossy*value; - L->path_transmission = bsdf_eval->transmission*value; - L->path_subsurface = bsdf_eval->subsurface*value; - L->path_scatter = bsdf_eval->scatter*value; - - *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter; + L_state->diffuse = bsdf_eval->diffuse*value; + L_state->glossy = bsdf_eval->glossy*value; + L_state->transmission = bsdf_eval->transmission*value; + L_state->subsurface = bsdf_eval->subsurface*value; + L_state->scatter = bsdf_eval->scatter*value; + + *throughput = L_state->diffuse + + L_state->glossy + + L_state->transmission + + L_state->subsurface + + L_state->scatter; - L->direct_throughput = *throughput; + L_state->direct = *throughput; } else { /* transparent bounce before first hit, or indirectly visible through BSDF */ @@ -264,13 +287,22 @@ ccl_device_inline void 
path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space } } -ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce) +ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) + if(state->bounce == 0) L->emission += throughput*value; - else if(bounce == 1) + else if(state->bounce == 1) L->direct_emission += throughput*value; else L->indirect += throughput*value; @@ -289,6 +321,18 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 bsdf, float3 ao) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { if(state->bounce == 0) { @@ -306,14 +350,6 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, { L->emission += throughput*bsdf*ao; } - -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; - } -#endif } ccl_device_inline void path_radiance_accum_total_ao( @@ -342,6 +378,18 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float shadow_fac, bool is_lamp) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { if(state->bounce == 0) { @@ -368,14 +416,6 @@ ccl_device_inline void 
path_radiance_accum_light(PathRadiance *L, { L->emission += throughput*bsdf_eval->diffuse*shadow; } - -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; - } -#endif } ccl_device_inline void path_radiance_accum_total_light( @@ -396,11 +436,24 @@ ccl_device_inline void path_radiance_accum_total_light( #endif } -ccl_device_inline void path_radiance_accum_background(PathRadiance *L, - ccl_addr_space PathState *state, - float3 throughput, - float3 value) +ccl_device_inline void path_radiance_accum_background( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) { + +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + L->path_total_shaded += throughput * value * L->shadow_transparency; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { if(state->bounce == 0) @@ -416,20 +469,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, L->emission += throughput*value; } -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { - L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; - } - } -#endif - #ifdef __DENOISING_FEATURES__ L->denoising_albedo += state->denoising_feature_weight * value; #endif /* __DENOISING_FEATURES__ */ } +ccl_device_inline void path_radiance_accum_transparent( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput) +{ + L->transparent += average(throughput); +} + +#ifdef __SHADOW_TRICKS__ +ccl_device_inline void path_radiance_accum_shadowcatcher( + PathRadiance *L, + float3 throughput, + float3 background) +{ + L->shadow_throughput += average(throughput); + L->shadow_background_color += throughput * background; 
+ L->has_shadow_catcher = 1; +} +#endif + ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) { #ifdef __PASSES__ @@ -437,19 +501,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) * only a single throughput further along the path, here we recover just * the indirect path that is not influenced by any particular BSDF type */ if(L->use_light_pass) { - L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput); - L->direct_diffuse += L->path_diffuse*L->direct_emission; - L->direct_glossy += L->path_glossy*L->direct_emission; - L->direct_transmission += L->path_transmission*L->direct_emission; - L->direct_subsurface += L->path_subsurface*L->direct_emission; - L->direct_scatter += L->path_scatter*L->direct_emission; - - L->indirect = safe_divide_color(L->indirect, L->direct_throughput); - L->indirect_diffuse += L->path_diffuse*L->indirect; - L->indirect_glossy += L->path_glossy*L->indirect; - L->indirect_transmission += L->path_transmission*L->indirect; - L->indirect_subsurface += L->path_subsurface*L->indirect; - L->indirect_scatter += L->path_scatter*L->indirect; + L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct); + L->direct_diffuse += L->state.diffuse*L->direct_emission; + L->direct_glossy += L->state.glossy*L->direct_emission; + L->direct_transmission += L->state.transmission*L->direct_emission; + L->direct_subsurface += L->state.subsurface*L->direct_emission; + L->direct_scatter += L->state.scatter*L->direct_emission; + + L->indirect = safe_divide_color(L->indirect, L->state.direct); + L->indirect_diffuse += L->state.diffuse*L->indirect; + L->indirect_glossy += L->state.glossy*L->indirect; + L->indirect_transmission += L->state.transmission*L->indirect; + L->indirect_subsurface += L->state.subsurface*L->indirect; + L->indirect_scatter += L->state.scatter*L->indirect; } #endif } @@ -458,11 +522,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) { #ifdef 
__PASSES__ if(L->use_light_pass) { - L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); - L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); - L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); - L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); + L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f); + L->state.glossy = make_float3(0.0f, 0.0f, 0.0f); + L->state.transmission = make_float3(0.0f, 0.0f, 0.0f); + L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->state.scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = make_float3(0.0f, 0.0f, 0.0f); L->indirect = make_float3(0.0f, 0.0f, 0.0f); @@ -475,11 +539,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, { #ifdef __PASSES__ if(L->use_light_pass) { - L->path_diffuse = L_src->path_diffuse; - L->path_glossy = L_src->path_glossy; - L->path_transmission = L_src->path_transmission; - L->path_subsurface = L_src->path_subsurface; - L->path_scatter = L_src->path_scatter; + L->state = L_src->state; L->direct_emission = L_src->direct_emission; L->indirect = L_src->indirect; @@ -487,7 +547,36 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, #endif } -ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L) +#ifdef __SHADOW_TRICKS__ +ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg, + PathRadiance *L, + float3 *L_sum, + float *alpha) +{ + /* Calculate current shadow of the path. */ + float path_total = average(L->path_total); + float shadow; + + if(path_total == 0.0f) { + shadow = L->shadow_transparency; + } + else { + float path_total_shaded = average(L->path_total_shaded); + shadow = path_total_shaded / path_total; + } + + /* Calculate final light sum and transparency for shadow catcher object. 
*/ + if(kernel_data.background.transparent) { + *alpha -= L->shadow_throughput * shadow; + } + else { + L->shadow_background_color *= shadow; + *L_sum += L->shadow_background_color; + } +} +#endif + +ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha) { float3 L_sum; /* Light Passes are used */ @@ -564,8 +653,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L_sum = L_direct + L_indirect; } #endif - - return L_sum; } /* No Light Passes */ @@ -573,14 +660,24 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi #endif { L_sum = L->emission; + + /* Reject invalid value */ + float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); + if(!isfinite_safe(sum)) { + kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); + L_sum = make_float3(0.0f, 0.0f, 0.0f); + } } - /* Reject invalid value */ - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if(!isfinite_safe(sum)) { - kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); - L_sum = make_float3(0.0f, 0.0f, 0.0f); + /* Compute alpha. */ + *alpha = 1.0f - L->transparent; + + /* Add shadow catcher contributions. 
*/ +#ifdef __SHADOW_TRICKS__ + if(L->has_shadow_catcher) { + path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha); } +#endif /* __SHADOW_TRICKS__ */ return L_sum; } @@ -613,14 +710,18 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadi *clean = make_float3(0.0f, 0.0f, 0.0f); #endif +#ifdef __SHADOW_TRICKS__ + if(L->has_shadow_catcher) { + *noisy += L->shadow_background_color; + } +#endif + *noisy = ensure_finite3(*noisy); *clean = ensure_finite3(*clean); } -ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) +ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample) { - float fac = 1.0f/num_samples; - #ifdef __SPLIT_KERNEL__ # define safe_float3_add(f, v) \ do { \ @@ -629,65 +730,35 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance atomic_add_and_fetch_float(p+1, (v).y); \ atomic_add_and_fetch_float(p+2, (v).z); \ } while(0) +# define safe_float_add(f, v) \ + atomic_add_and_fetch_float(&(f), (v)) #else # define safe_float3_add(f, v) (f) += (v) +# define safe_float_add(f, v) (f) += (v) #endif /* __SPLIT_KERNEL__ */ #ifdef __PASSES__ - safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac); - safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac); - safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac); - safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac); - safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac); - - safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac); - safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac); - safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac); - safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac); - safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac); - - safe_float3_add(L->background, L_sample->background*fac); - 
safe_float3_add(L->ao, L_sample->ao*fac); - safe_float3_add(L->shadow, L_sample->shadow*fac); -# ifdef __SPLIT_KERNEL__ - atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac); -# else - L->mist += L_sample->mist*fac; -# endif /* __SPLIT_KERNEL__ */ + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface); + safe_float3_add(L->direct_scatter, L_sample->direct_scatter); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission); + safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter); + + safe_float3_add(L->background, L_sample->background); + safe_float3_add(L->ao, L_sample->ao); + safe_float3_add(L->shadow, L_sample->shadow); + safe_float_add(L->mist, L_sample->mist); #endif /* __PASSES__ */ - safe_float3_add(L->emission, L_sample->emission*fac); + safe_float3_add(L->emission, L_sample->emission); +#undef safe_float_add #undef safe_float3_add } -#ifdef __SHADOW_TRICKS__ -/* Calculate current shadow of the path. */ -ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) -{ - float path_total = average(L->path_total); - float path_total_shaded = average(L->path_total_shaded); - if(path_total != 0.0f) { - return path_total_shaded / path_total; - } - return 1.0f; -} - -/* Calculate final light sum and transparency for shadow catcher object. 
*/ -ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, - const PathRadiance *L, - float* alpha) -{ - const float shadow = path_radiance_sum_shadow(L); - float3 L_sum; - if(kernel_data.background.transparent) { - *alpha = 1.0f-shadow; - L_sum = make_float3(0.0f, 0.0f, 0.0f); - } - else { - L_sum = L->shadow_color * shadow; - } - return L_sum; -} -#endif - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index f18d145f7cf..4d89839c46c 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, - RNG rng, + uint rng_hash, int pass_filter, int sample) { @@ -48,11 +48,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, path_radiance_init(&L_sample, kernel_data.film.use_light_pass); /* init path state */ - path_state_init(kg, &emission_sd, &state, &rng, sample, NULL); + path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL); /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, &state, state.flag); /* TODO, disable more closures we don't need besides transparent */ shader_bsdf_disable_transparency(kg, sd); @@ -64,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd)); + kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd)); } /* sample emission */ if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - 
path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + path_radiance_accum_emission(&L_sample, &state, throughput, emission); } bool is_sss_sample = false; @@ -86,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, &emission_sd, &L_sample, &state, - &rng, &ray, &throughput, &ss_indirect)) @@ -101,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, kernel_path_indirect(kg, &indirect_sd, &emission_sd, - &rng, &ray, throughput, - state.num_samples, &state, &L_sample); - kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample); } is_sss_sample = true; } @@ -116,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample light and BSDF */ if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) { - kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample); + kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample); - if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) { + if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) { #ifdef __LAMP_MIS__ state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample); + kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); @@ -137,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput); + kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput); } /* sample emission */ if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { float3 emission = 
indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + path_radiance_accum_emission(&L_sample, &state, throughput, emission); } #ifdef __SUBSURFACE__ @@ -151,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd, - &emission_sd, &L_sample, &state, &rng, &ray, throughput); + &emission_sd, &L_sample, &state, &ray, throughput); } #endif @@ -161,20 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* direct light */ if(kernel_data.integrator.use_direct_light) { int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, &rng, + kernel_branched_path_surface_connect_light(kg, sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all); } #endif /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, &rng, + kernel_branched_path_surface_indirect_light(kg, sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample); } } #endif /* accumulate into master L */ - path_radiance_accum_sample(L, &L_sample, 1); + path_radiance_accum_sample(L, &L_sample); } ccl_device bool is_aa_pass(ShaderEvalType type) @@ -225,7 +220,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg, ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, ShaderData *sd, - RNG *rng, PathState *state, float3 direct, float3 indirect, @@ -245,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, } else { /* surface color of the pass only */ - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0); return kernel_bake_shader_bsdf(kg, sd, type); } } else { - 
shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0); color = kernel_bake_shader_bsdf(kg, sd, type); } @@ -292,14 +286,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, int num_samples = kernel_data.integrator.aa_samples; /* random number generator */ - RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed); + uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed); float filter_x, filter_y; if(sample == 0) { filter_x = filter_y = 0.5f; } else { - path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); + path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); } /* subpixel u/v offset */ @@ -335,18 +329,18 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* light passes if we need more than color */ if(pass_filter & ~BAKE_FILTER_COLOR) - compute_light_pass(kg, &sd, &L, rng, pass_filter, sample); + compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample); switch(type) { /* data passes */ case SHADER_EVAL_NORMAL: { if((sd.flag & SD_HAS_BUMP)) { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &sd, &state, 0); } - /* compression: normal = (2 * color) - 1 */ - out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); + /* encoding: normal = (2 * color) - 1 */ + out = shader_bsdf_average_normal(kg, &sd) * 0.5f + make_float3(0.5f, 0.5f, 0.5f); break; } case SHADER_EVAL_UV: @@ -356,7 +350,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } case SHADER_EVAL_EMISSION: { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, &sd, &state, 0); out = shader_emissive_eval(kg, &sd); break; } @@ -371,7 +365,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, case SHADER_EVAL_COMBINED: { if((pass_filter & 
BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) { - out = path_radiance_clamp_and_sum(kg, &L); + float alpha; + out = path_radiance_clamp_and_sum(kg, &L, &alpha); break; } @@ -409,7 +404,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_diffuse, L.indirect_diffuse, @@ -421,7 +415,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_glossy, L.indirect_glossy, @@ -433,7 +426,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_transmission, L.indirect_transmission, @@ -446,7 +438,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, #ifdef __SUBSURFACE__ out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_subsurface, L.indirect_subsurface, @@ -480,7 +471,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* evaluate */ int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); + out = shader_eval_background(kg, &sd, &state, flag); break; } default: @@ -524,7 +515,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, /* evaluate */ float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN); + shader_eval_displacement(kg, &sd, &state); out = sd.P - P; object_inverse_dir_transform(kg, &sd, &out); @@ -552,7 +543,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, /* evaluate */ int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); + out = shader_eval_background(kg, &sd, &state, flag); } /* write output */ diff --git 
a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 38708f7ff0b..1e2af9de8b3 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -38,11 +38,15 @@ /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ +#if __CUDA_ARCH__ < 300 +# define ccl_device_inline __device__ __inline__ # define ccl_device_forceinline __device__ __forceinline__ -#if __CUDA_ARCH__ < 500 +#elif __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_global @@ -53,6 +57,10 @@ #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +/* TODO(sergey): In theory we might use references with CUDA, however + * performance impact yet to be investigated. 
+ */ +#define ccl_ref #define ccl_align(n) __align__(n) #define ATTR_FALLTHROUGH diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 4836c290312..36d6031d042 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -42,6 +42,7 @@ #define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict +#define ccl_ref #define ccl_align(n) __attribute__((aligned(n))) #ifdef __SPLIT_KERNEL__ @@ -129,6 +130,7 @@ # define expf(x) native_exp(((float)(x))) # define sqrtf(x) native_sqrt(((float)(x))) # define logf(x) native_log(((float)(x))) +# define rcp(x) native_recip(x) #else # define sinf(x) sin(((float)(x))) # define cosf(x) cos(((float)(x))) @@ -136,11 +138,12 @@ # define expf(x) exp(((float)(x))) # define sqrtf(x) sqrt(((float)(x))) # define logf(x) log(((float)(x))) +# define rcp(x) recip(x) #endif /* data lookup defines */ #define kernel_data (*kg->data) -#define kernel_tex_fetch(t, index) kg->t[index] +#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)] /* define NULL */ #define NULL 0 diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h deleted file mode 100644 index 5647bbae5b5..00000000000 --- a/intern/cycles/kernel/kernel_debug.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2011-2014 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void debug_data_init(DebugData *debug_data) -{ - debug_data->num_bvh_traversed_nodes = 0; - debug_data->num_bvh_traversed_instances = 0; - debug_data->num_bvh_intersections = 0; - debug_data->num_ray_bounces = 0; -} - -ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, - ccl_global float *buffer, - ccl_addr_space PathState *state, - DebugData *debug_data, - int sample) -{ - int flag = kernel_data.film.pass_flag; - if(flag & PASS_BVH_TRAVERSED_NODES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, - sample, - debug_data->num_bvh_traversed_nodes); - } - if(flag & PASS_BVH_TRAVERSED_INSTANCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, - sample, - debug_data->num_bvh_traversed_instances); - } - if(flag & PASS_BVH_INTERSECTIONS) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, - sample, - debug_data->num_bvh_intersections); - } - if(flag & PASS_RAY_BOUNCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, - sample, - debug_data->num_ray_bounces); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 9e7d51f23f5..45b8c6311e1 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -37,16 +37,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ray.D = ls->D; ray.P = ls->P; ray.t = 1.0f; -# ifdef __OBJECT_MOTION__ ray.time = time; -# endif ray.dP = differential3_zero(); ray.dD = dI; shader_setup_from_background(kg, emission_sd, &ray); path_state_modify_bounce(state, true); - eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION); + eval = shader_eval_background(kg, emission_sd, state, 0); 
path_state_modify_bounce(state, false); } else @@ -72,7 +70,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, /* no path flag, we're evaluating this for all closures. that's weak but * we'd have to do multiple evaluations otherwise */ path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, emission_sd, state, 0); path_state_modify_bounce(state, false); /* evaluate emissive closure */ @@ -216,7 +214,7 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); + float pdf = triangle_light_pdf(kg, sd, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; @@ -319,7 +317,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, # endif path_state_modify_bounce(state, true); - float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION); + float3 L = shader_eval_background(kg, emission_sd, state, state->flag); path_state_modify_bounce(state, false); #ifdef __BACKGROUND_MIS__ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index f95f0d98c52..9d55183d94b 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -23,6 +23,10 @@ # include "util/util_vector.h" #endif +#ifdef __KERNEL_OPENCL__ +# include "util/util_atomic.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -109,11 +113,22 @@ typedef struct KernelGlobals { #ifdef __KERNEL_OPENCL__ +# define KERNEL_TEX(type, ttype, name) \ +typedef type name##_t; +# include "kernel/kernel_textures.h" + +typedef struct tex_info_t { + uint buffer, padding; + uint64_t offset; + uint width, height, depth, options; +} 
tex_info_t; + typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; + ccl_global char *buffers[8]; # define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; + tex_info_t name; # include "kernel/kernel_textures.h" # ifdef __SPLIT_KERNEL__ @@ -122,6 +137,57 @@ typedef ccl_addr_space struct KernelGlobals { # endif } KernelGlobals; +#define KERNEL_BUFFER_PARAMS \ + ccl_global char *buffer0, \ + ccl_global char *buffer1, \ + ccl_global char *buffer2, \ + ccl_global char *buffer3, \ + ccl_global char *buffer4, \ + ccl_global char *buffer5, \ + ccl_global char *buffer6, \ + ccl_global char *buffer7 + +#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7 + +ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS) +{ +#ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +#endif + { + kg->buffers[0] = buffer0; + kg->buffers[1] = buffer1; + kg->buffers[2] = buffer2; + kg->buffers[3] = buffer3; + kg->buffers[4] = buffer4; + kg->buffers[5] = buffer5; + kg->buffers[6] = buffer6; + kg->buffers[7] = buffer7; + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + +ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg) +{ +# ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +# endif + { + ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0]; + +# define KERNEL_TEX(type, ttype, name) \ + kg->name = *(info++); +# include "kernel/kernel_textures.h" + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + #endif /* __KERNEL_OPENCL__ */ /* Interpolated lookup table access */ diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h index 90747e09357..9e3373432ec 100644 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ b/intern/cycles/kernel/kernel_image_opencl.h @@ -15,30 +15,42 @@ */ -/* For OpenCL all images are packed in 
a single array, and we do manual lookup - * and interpolation. */ +/* For OpenCL we do manual lookup and interpolation. */ + +ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) { + const uint tex_offset = id +#define KERNEL_TEX(type, ttype, name) + 1 +#include "kernel/kernel_textures.h" + ; + + return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset]; +} + +#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)] ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) { + const ccl_global tex_info_t *info = kernel_tex_info(kg, id); const int texture_type = kernel_tex_type(id); + /* Float4 */ if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { - return kernel_tex_fetch(__tex_image_float4_packed, offset); + return tex_fetch(float4, info, offset); } /* Byte4 */ else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { - uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); + uchar4 r = tex_fetch(uchar4, info, offset); float f = 1.0f/255.0f; return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); } /* Float */ else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { - float f = kernel_tex_fetch(__tex_image_float_packed, offset); + float f = tex_fetch(float, info, offset); return make_float4(f, f, f, 1.0f); } /* Byte */ else { - uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset); + uchar r = tex_fetch(uchar, info, offset); float f = r * (1.0f/255.0f); return make_float4(f, f, f, 1.0f); } @@ -64,17 +76,17 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix) return x - (float)i; } -ccl_device_inline uint kernel_decode_image_interpolation(uint4 info) +ccl_device_inline uint kernel_decode_image_interpolation(uint info) { - return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; + return (info & (1 << 0)) ? 
INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; } -ccl_device_inline uint kernel_decode_image_extension(uint4 info) +ccl_device_inline uint kernel_decode_image_extension(uint info) { - if(info.w & (1 << 1)) { + if(info & (1 << 1)) { return EXTENSION_REPEAT; } - else if(info.w & (1 << 2)) { + else if(info & (1 << 2)) { return EXTENSION_EXTEND; } else { @@ -84,13 +96,16 @@ ccl_device_inline uint kernel_decode_image_extension(uint4 info) ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; + const ccl_global tex_info_t *info = kernel_tex_info(kg, id); + + uint width = info->width; + uint height = info->height; + uint offset = 0; + /* Decode image options. */ - uint interpolation = kernel_decode_image_interpolation(info); - uint extension = kernel_decode_image_extension(info); + uint interpolation = kernel_decode_image_interpolation(info->options); + uint extension = kernel_decode_image_extension(info->options); + /* Actual sampling. */ float4 r; int ix, iy, nix, niy; @@ -150,14 +165,17 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) { - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; - uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; + const ccl_global tex_info_t *info = kernel_tex_info(kg, id); + + uint width = info->width; + uint height = info->height; + uint offset = 0; + uint depth = info->depth; + /* Decode image options. 
*/ - uint interpolation = kernel_decode_image_interpolation(info); - uint extension = kernel_decode_image_extension(info); + uint interpolation = kernel_decode_image_interpolation(info->options); + uint extension = kernel_decode_image_extension(info->options); + /* Actual sampling. */ float4 r; int ix, iy, iz, nix, niy, niz; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index 9baa9d54957..c806deee8e7 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -396,11 +396,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg, + (1.0f - portal_sampling_pdf) * cdf_pdf); } return D; - } else { + } + else { /* Sample map, but with nonzero portal_sampling_pdf for MIS. */ randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf); } - } else { + } + else { /* We can't sample a portal. * Check if we can sample the map instead. */ @@ -763,78 +765,280 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, /* Triangle Light */ -ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time) +/* returns true if the triangle is has motion blur or an instancing transform applied */ +ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3]) { + bool has_motion = false; + const int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) { + motion_triangle_vertices(kg, object, prim, time, V); + has_motion = true; + } + else { + triangle_vertices(kg, prim, V); + } + #ifdef __INSTANCING__ - /* instance transform */ - if(!(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { # ifdef __OBJECT_MOTION__ - Transform itfm; - Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm); + Transform tfm = 
object_fetch_transform_motion_test(kg, object, time, NULL); # else Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); # endif - - ls->P = transform_point(&tfm, ls->P); - ls->Ng = normalize(transform_direction(&tfm, ls->Ng)); + V[0] = transform_point(&tfm, V[0]); + V[1] = transform_point(&tfm, V[1]); + V[2] = transform_point(&tfm, V[2]); + has_motion = true; } #endif + return has_motion; } -ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object, - float randu, float randv, float time, LightSample *ls) +ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t) { - float u, v; + float pdf = kernel_data.integrator.pdf_triangles; + float cos_pi = fabsf(dot(Ng, I)); - /* compute random point in triangle */ - randu = sqrtf(randu); + if(cos_pi == 0.0f) + return 0.0f; + + return t*t*pdf/cos_pi; +} - u = 1.0f - randu; - v = randv*randu; +ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. 
*/ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N = cross(e0, e1); + const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* sd contains the point on the light source + * calculate Px, the point that we're shading */ + const float3 Px = sd->P + sd->I * t; + const float3 v0_p = V[0] - Px; + const float3 v1_p = V[1] - Px; + const float3 v2_p = V[2] - Px; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float alpha = fast_acosf(dot(u02, u01)); + const float beta = fast_acosf(-dot(u01, u12)); + const float gamma = fast_acosf(dot(u02, u12)); + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* pdf_triangles is calculated over triangle area, but we're not sampling over its area */ + if(UNLIKELY(solid_angle == 0.0f)) { + return 0.0f; + } + else { + float area = 1.0f; + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + else { + area = 0.5f * len(N); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + return pdf / solid_angle; + } + } + else { + float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t); + if(has_motion) { + const float area = 0.5f * len(N); + if(UNLIKELY(area == 0.0f)) { + return 0.0f; + } + /* scale the PDF. 
+ * area = the area the sample was taken from + * area_pre = the are from which pdf_triangles was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + pdf = pdf * area_pre / area; + } + return pdf; + } +} - /* triangle, so get position, normal, shader */ - triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader); +ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object, + float randu, float randv, float time, LightSample *ls, const float3 P) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. */ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N0 = cross(e0, e1); + float Nl = 0.0f; + ls->Ng = safe_normalize_len(N0, &Nl); + float area = 0.5f * Nl; + + /* flip normal if necessary */ + const int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + ls->Ng = -ls->Ng; + } + ls->eval_fac = 1.0f; + ls->shader = kernel_tex_fetch(__tri_shader, prim); ls->object = object; ls->prim = prim; ls->lamp = LAMP_NONE; ls->shader |= SHADER_USE_MIS; - ls->t = 0.0f; - ls->u = u; - ls->v = v; ls->type = LIGHT_TRIANGLE; - ls->eval_fac = 1.0f; - object_transform_light_sample(kg, ls, object, time); -} + float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0)); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* see James Arvo, "Stratified Sampling of Spherical Triangles" + * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */ + + /* project the triangle to the 
unit sphere + * and calculate its edges and angles */ + const float3 v0_p = V[0] - P; + const float3 v1_p = V[1] - P; + const float3 v2_p = V[2] - P; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float3 A = safe_normalize(v0_p); + const float3 B = safe_normalize(v1_p); + const float3 C = safe_normalize(v2_p); + + const float cos_alpha = dot(u02, u01); + const float cos_beta = -dot(u01, u12); + const float cos_gamma = dot(u02, u12); + + /* calculate dihedral angles */ + const float alpha = fast_acosf(cos_alpha); + const float beta = fast_acosf(cos_beta); + const float gamma = fast_acosf(cos_gamma); + /* the area of the unit spherical triangle = solid angle */ + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* precompute a few things + * these could be re-used to take several samples + * as they are independent of randu/randv */ + const float cos_c = dot(A, B); + const float sin_alpha = fast_sinf(alpha); + const float product = sin_alpha * cos_c; + + /* Select a random sub-area of the spherical triangle + * and calculate the third vertex C_ of that new triangle */ + const float phi = randu * solid_angle - alpha; + float s, t; + fast_sincosf(phi, &s, &t); + const float u = t - cos_alpha; + const float v = s + product; + + const float3 U = safe_normalize(C - dot(C, A) * A); + + float q = 1.0f; + const float det = ((v * s + u * t) * sin_alpha); + if(det != 0.0f) { + q = ((v * t - u * s) * cos_alpha - v) / det; + } + const float temp = max(1.0f - q*q, 0.0f); -ccl_device float triangle_light_pdf(KernelGlobals *kg, - const float3 Ng, const float3 I, float t) -{ - float pdf = kernel_data.integrator.pdf_triangles; - float cos_pi = fabsf(dot(Ng, I)); + const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U); - if(cos_pi == 0.0f) - return 0.0f; - - return t*t*pdf/cos_pi; + /* Finally, select a random point along the edge of the 
new triangle + * That point on the spherical triangle is the sampled ray direction */ + const float z = 1.0f - randv * (1.0f - dot(C_, B)); + ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B); + + /* calculate intersection with the planar triangle */ + if(!ray_triangle_intersect(P, ls->D, FLT_MAX, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)V, +#else + V[0], V[1], V[2], +#endif + &ls->u, &ls->v, &ls->t)) { + ls->pdf = 0.0f; + return; + } + + ls->P = P + ls->D * ls->t; + + /* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */ + if(UNLIKELY(solid_angle == 0.0f)) { + ls->pdf = 0.0f; + return; + } + else { + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + ls->pdf = pdf / solid_angle; + } + } + else { + /* compute random point in triangle */ + randu = sqrtf(randu); + + const float u = 1.0f - randu; + const float v = randv*randu; + const float t = 1.0f - u - v; + ls->P = u * V[0] + v * V[1] + t * V[2]; + /* compute incoming direction, distance and pdf */ + ls->D = normalize_len(ls->P - P, &ls->t); + ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t); + if(has_motion && area != 0.0f) { + /* scale the PDF. 
+ * area = the area the sample was taken from + * area_pre = the are from which pdf_triangles was calculated from */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + ls->pdf = ls->pdf * area_pre / area; + } + ls->u = u; + ls->v = v; + } } /* Light Distribution */ -ccl_device int light_distribution_sample(KernelGlobals *kg, float randt) +ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) { - /* this is basically std::upper_bound as used by pbrt, to find a point light or + /* This is basically std::upper_bound as used by pbrt, to find a point light or * triangle to emit from, proportional to area. a good improvement would be to * also sample proportional to power, though it's not so well defined with - * OSL shaders. */ + * arbitrary shaders. */ int first = 0; int len = kernel_data.integrator.num_distribution + 1; + float r = *randu; while(len > 0) { int half_len = len >> 1; int middle = first + half_len; - if(randt < kernel_tex_fetch(__light_distribution, middle).x) { + if(r < kernel_tex_fetch(__light_distribution, middle).x) { len = half_len; } else { @@ -843,9 +1047,17 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt) } } - /* clamping should not be needed but float rounding errors seem to - * make this fail on rare occasions */ - return clamp(first-1, 0, kernel_data.integrator.num_distribution-1); + /* Clamping should not be needed but float rounding errors seem to + * make this fail on rare occasions. */ + int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1); + + /* Rescale to reuse random number. this helps the 2D samples within + * each area light be stratified as well. 
*/ + float distr_min = kernel_tex_fetch(__light_distribution, index).x; + float distr_max = kernel_tex_fetch(__light_distribution, index+1).x; + *randu = (r - distr_min)/(distr_max - distr_min); + + return index; } /* Generic Light */ @@ -857,7 +1069,6 @@ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, i } ccl_device_noinline bool light_sample(KernelGlobals *kg, - float randt, float randu, float randv, float time, @@ -866,7 +1077,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, LightSample *ls) { /* sample index */ - int index = light_distribution_sample(kg, randt); + int index = light_distribution_sample(kg, &randu); /* fetch light data */ float4 l = kernel_tex_fetch(__light_distribution, index); @@ -876,10 +1087,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, int object = __float_as_int(l.w); int shader_flag = __float_as_int(l.z); - triangle_light_sample(kg, prim, object, randu, randv, time, ls); - /* compute incoming direction, distance and pdf */ - ls->D = normalize_len(ls->P - P, &ls->t); - ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t); + triangle_light_sample(kg, prim, object, randu, randv, time, ls, P); ls->shader |= shader_flag; return (ls->pdf > 0.0f); } diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 9cd7ffb181d..fff7f4cfdb7 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -194,8 +194,38 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, #endif /* __DENOISING_FEATURES__ */ } +#ifdef __KERNEL_DEBUG__ +ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, + ccl_global float *buffer, + PathRadiance *L, + int sample) +{ + int flag = kernel_data.film.pass_flag; + if(flag & PASS_BVH_TRAVERSED_NODES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, + sample, + L->debug_data.num_bvh_traversed_nodes); + } + if(flag & 
PASS_BVH_TRAVERSED_INSTANCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, + sample, + L->debug_data.num_bvh_traversed_instances); + } + if(flag & PASS_BVH_INTERSECTIONS) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, + sample, + L->debug_data.num_bvh_intersections); + } + if(flag & PASS_RAY_BOUNCES) { + kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, + sample, + L->debug_data.num_ray_bounces); + } +} +#endif /* __KERNEL_DEBUG__ */ + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, - ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) + ShaderData *sd, ccl_addr_space PathState *state, float3 throughput) { #ifdef __PASSES__ int path_flag = state->flag; @@ -213,6 +243,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { + int sample = state->sample; if(sample == 0) { if(flag & PASS_DEPTH) { @@ -230,7 +261,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = sd->N; + float3 normal = shader_bsdf_average_normal(kg, sd); kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -334,19 +365,11 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f } ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, - int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) + int sample, PathRadiance *L) { if(L) { - float3 L_sum; -#ifdef __SHADOW_TRICKS__ - if(is_shadow_catcher) { - L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, L); - } + float alpha; + float3 
L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); @@ -361,16 +384,7 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float * # endif if(kernel_data.film.pass_denoising_clean) { float3 noisy, clean; -#ifdef __SHADOW_TRICKS__ - if(is_shadow_catcher) { - noisy = L_sum; - clean = make_float3(0.0f, 0.0f, 0.0f); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - path_radiance_split_denoising(kg, L, &noisy, &clean); - } + path_radiance_split_denoising(kg, L, &noisy, &clean); kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample, noisy); kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, @@ -389,6 +403,11 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float * sample, L->denoising_depth); } #endif /* __DENOISING_FEATURES__ */ + + +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, L, sample); +#endif } else { kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index fc093ad8319..793fede0deb 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -48,25 +48,308 @@ #include "kernel/kernel_path_volume.h" #include "kernel/kernel_path_subsurface.h" +CCL_NAMESPACE_BEGIN + +ccl_device_forceinline bool kernel_path_scene_intersect( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + Intersection *isect, + PathRadiance *L) +{ + uint visibility = path_state_ray_visibility(kg, state); + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + + if(kernel_data.bvh.have_curves) { + if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray->dD.dx + ray->dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = 
kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init_addrspace(state, 0x51633e2d); + } + + if(path_state_ao_bounce(kg, state)) { + visibility = PATH_RAY_SHADOW; + ray->t = kernel_data.background.ao_distance; + } + + bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f); +#endif /* __HAIR__ */ + #ifdef __KERNEL_DEBUG__ -# include "kernel/kernel_debug.h" -#endif + if(state->flag & PATH_RAY_CAMERA) { + L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes; + L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances; + L->debug_data.num_bvh_intersections += isect->num_intersections; + } + L->debug_data.num_ray_bounces++; +#endif /* __KERNEL_DEBUG__ */ -CCL_NAMESPACE_BEGIN + return hit; +} + +ccl_device_forceinline void kernel_path_lamp_emission( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + float3 throughput, + ccl_addr_space Intersection *isect, + ShaderData *emission_sd, + PathRadiance *L) +{ +#ifdef __LAMP_MIS__ + if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { + /* ray starting from previous non-transparent bounce */ + Ray light_ray; + + light_ray.P = ray->P - state->ray_t*ray->D; + state->ray_t += isect->t; + light_ray.D = ray->D; + light_ray.t = state->ray_t; + light_ray.time = ray->time; + light_ray.dD = ray->dD; + light_ray.dP = ray->dP; + + /* intersect with lamp */ + float3 emission; + + if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) + path_radiance_accum_emission(L, state, throughput, emission); + } +#endif /* __LAMP_MIS__ */ +} + +ccl_device_forceinline void kernel_path_background( + KernelGlobals *kg, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + float3 throughput, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* eval background 
shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + L->transparent += average(throughput); + +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif /* __PASSES__ */ + return; + } + +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, emission_sd, state, ray); + path_radiance_accum_background(L, state, throughput, L_background); +#endif /* __BACKGROUND__ */ +} + +#ifndef __SPLIT_KERNEL__ + +#ifdef __VOLUME__ +ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( + KernelGlobals *kg, + ShaderData *sd, + PathState *state, + Ray *ray, + float3 *throughput, + ccl_addr_space Intersection *isect, + bool hit, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + /* volume attenuation, emission, scatter */ + if(state->volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = *ray; + volume_ray.t = (hit)? 
isect->t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + +# ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state->volume_stack); + bool direct = (state->flag & PATH_RAY_CAMERA) != 0; + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + + shader_setup_from_volume(kg, sd, &volume_ray); + kernel_volume_decoupled_record(kg, state, + &volume_ray, sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + int all = kernel_data.integrator.sample_all_lights_indirect; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, sd, + emission_sd, *throughput, state, L, all, + &volume_ray, &volume_segment); + + /* indirect sample. 
if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + state, &volume_ray, sd, throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) + return VOLUME_PATH_SCATTERED; + else + return VOLUME_PATH_MISSED; + } + else { + *throughput *= volume_segment.accum_transmittance; + } + } + else +# endif /* __VOLUME_DECOUPLED__ */ + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) + return VOLUME_PATH_SCATTERED; + else + return VOLUME_PATH_MISSED; + } +# endif /* __VOLUME_SCATTER__ */ + } + } + + return VOLUME_PATH_ATTENUATED; +} +#endif /* __VOLUME__ */ + +#endif /* __SPLIT_KERNEL__ */ + +ccl_device_forceinline bool kernel_path_shader_apply( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + float3 throughput, + ShaderData *emission_sd, + PathRadiance *L, + ccl_global float *buffer) +{ +#ifdef __SHADOW_TRICKS__ + if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { + if(state->flag & PATH_RAY_CAMERA) { + state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_STORE_SHADOW_INFO); + + float3 bg = make_float3(0.0f, 0.0f, 0.0f); + 
if(!kernel_data.background.transparent) { + bg = indirect_background(kg, emission_sd, state, ray); + } + path_radiance_accum_shadowcatcher(L, throughput, bg); + } + } + else if(state->flag & PATH_RAY_SHADOW_CATCHER) { + /* Only update transparency after shadow catcher bounce. */ + L->shadow_transparency *= + average(shader_bsdf_transparency(kg, sd)); + } +#endif /* __SHADOW_TRICKS__ */ + + /* holdout */ +#ifdef __HOLDOUT__ + if(((sd->flag & SD_HOLDOUT) || + (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && + (state->flag & PATH_RAY_CAMERA)) + { + if(kernel_data.background.transparent) { + float3 holdout_weight; + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + } + else { + holdout_weight = shader_holdout_eval(kg, sd); + } + /* any throughput is ok, should all be identical here */ + L->transparent += average(holdout_weight*throughput); + } + + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { + return false; + } + } +#endif /* __HOLDOUT__ */ + + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, buffer, L, sd, state, throughput); + + /* blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); + } + } + +#ifdef __EMISSION__ + /* emission */ + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, state, throughput, emission); + } +#endif /* __EMISSION__ */ + + return true; +} ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, ccl_addr_space PathState *state, - RNG *rng, float3 throughput, 
float3 ao_alpha) { /* todo: solve correlation */ float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; @@ -83,13 +366,11 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -#endif /* __OBJECT_MOTION__ */ light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { @@ -100,265 +381,85 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, #ifndef __SPLIT_KERNEL__ +#if defined(__BRANCHED_PATH__) || defined(__BAKING__) + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, - RNG *rng, Ray *ray, float3 throughput, - int num_samples, PathState *state, PathRadiance *L) { /* path iteration */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. 
*/ Intersection isect; - uint visibility = path_state_ray_visibility(kg, state); - if(state->bounce > kernel_data.integrator.ao_bounces) { - visibility = PATH_RAY_SHADOW; - ray->t = kernel_data.background.ao_distance; - } - bool hit = scene_intersect(kg, - *ray, - visibility, - &isect, - NULL, - 0.0f, 0.0f); + bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray->P - state->ray_t*ray->D; - state->ray_t += isect.t; - light_ray.D = ray->D; - light_ray.t = state->ray_t; - light_ray.time = ray->time; - light_ray.dD = ray->dD; - light_ray.dP = ray->dP; - - /* intersect with lamp */ - float3 emission; - if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) { - path_radiance_accum_emission(L, - throughput, - emission, - state->bounce); - } - } -#endif /* __LAMP_MIS__ */ + /* Find intersection with lamps and compute emission for MIS. */ + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L); #ifdef __VOLUME__ - /* Sanitize volume stack. */ - if(!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - /* volume attenuation, emission, scatter */ - if(state->volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = *ray; - volume_ray.t = (hit)? 
isect.t: FLT_MAX; - - bool heterogeneous = - volume_stack_is_heterogeneous(kg, - state->volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = - volume_stack_sampling_method(kg, - state->volume_stack); - bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); - - if(decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, - sd, - &volume_ray); - kernel_volume_decoupled_record(kg, - state, - &volume_ray, - sd, - &volume_segment, - heterogeneous); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if(volume_segment.closure_flag & SD_EMISSION) { - path_radiance_accum_emission(L, - throughput, - volume_segment.accum_emission, - state->bounce); - } - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if(volume_segment.closure_flag & SD_SCATTER) { - int all = kernel_data.integrator.sample_all_lights_indirect; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, - rng, - sd, - emission_sd, - throughput, - state, - L, - all, - &volume_ray, - &volume_segment); - - /* indirect sample. 
if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter(kg, - state, - &volume_ray, - sd, - &throughput, - rphase, - rscatter, - &volume_segment, - NULL, - true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, - rng, - sd, - &throughput, - state, - L, - ray)) - { - continue; - } - else { - break; - } - } - else { - throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous); - -# ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, - rng, - sd, - emission_sd, - throughput, - state, - L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, - rng, - sd, - &throughput, - state, - L, - ray)) - { - continue; - } - else { - break; - } - } -# endif /* __VOLUME_SCATTER__ */ - } + /* Volume integration. */ + VolumeIntegrateResult result = kernel_path_volume(kg, + sd, + state, + ray, + &throughput, + &isect, + hit, + emission_sd, + L); + + if(result == VOLUME_PATH_SCATTERED) { + continue; + } + else if(result == VOLUME_PATH_MISSED) { + break; } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__*/ + /* Shade background. 
*/ if(!hit) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, emission_sd, state, ray); - path_radiance_accum_background(L, - state, - throughput, - L_background); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, state, ray, throughput, emission_sd, L); break; } - else if(state->bounce > kernel_data.integrator.ao_bounces) { + else if(path_state_ao_bounce(kg, state)) { break; } - /* setup shading */ + /* Setup and evaluate shader. */ shader_setup_from_ray(kg, sd, &isect, ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT); -#ifdef __BRANCHED_PATH__ - shader_merge_closures(sd); -#endif /* __BRANCHED_PATH__ */ - -#ifdef __SHADOW_TRICKS__ - if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { - state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; - } -#endif /* __SHADOW_TRICKS__ */ - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if(sd->flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission(kg, - sd, - isect.t, - state->flag, - state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); + shader_eval_surface(kg, sd, state, state->flag); + shader_prepare_closures(sd, state); + + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + sd, + state, + ray, + throughput, + emission_sd, + L, + NULL)) + { + break; } -#endif /* __EMISSION__ */ /* path termination. 
this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = - path_state_terminate_probability(kg, - state, - throughput*num_samples); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -371,7 +472,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { - kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f)); + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f)); } #endif /* __AO__ */ @@ -379,22 +480,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* bssrdf scatter to a different location on the same object, replacing * the closures with a diffuse BSDF */ if(sd->flag & SD_BSSRDF) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { - uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); - - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, - rng, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); + uint lcg_state = lcg_state_init(state, 0x68bc21eb); + subsurface_scatter_step(kg, sd, state, @@ -412,7 +509,6 @@ 
ccl_device void kernel_path_indirect(KernelGlobals *kg, int all = (kernel_data.integrator.sample_all_lights_indirect) || (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - rng, sd, emission_sd, state, @@ -423,38 +519,24 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } #endif /* defined(__EMISSION__) */ - if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) + if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray)) break; } } +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ -ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer, - PathRadiance *L, - bool *is_shadow_catcher) +ccl_device_forceinline void kernel_path_integrate( + KernelGlobals *kg, + PathState *state, + float3 throughput, + Ray *ray, + PathRadiance *L, + ccl_global float *buffer, + ShaderData *emission_sd) { - /* initialize */ - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; - - path_radiance_init(L, kernel_data.film.use_light_pass); - - /* shader data memory used for both volumes and surfaces, saves stack space */ + /* Shader data memory used for both volumes and surfaces, saves stack space. */ ShaderData sd; - /* shader data used by emission, shadows, volume stacks */ - ShaderData emission_sd; - - PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ #ifdef __SUBSURFACE__ SubsurfaceIndirectRays ss_indirect; @@ -465,265 +547,82 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, /* path iteration */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. 
*/ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d); - } - - if(state.bounce > kernel_data.integrator.ao_bounces) { - visibility = PATH_RAY_SHADOW; - ray.t = kernel_data.background.ao_distance; - } - - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif /* __HAIR__ */ + bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_bvh_intersections += isect.num_intersections; - } - debug_data.num_ray_bounces++; -#endif /* __KERNEL_DEBUG__ */ - -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray.P - state.ray_t*ray.D; - state.ray_t += isect.t; - light_ray.D = ray.D; - light_ray.t = state.ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; - - /* intersect with lamp */ - float3 emission; - - if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(L, throughput, emission, state.bounce); - } -#endif /* __LAMP_MIS__ */ + /* Find intersection with lamps and compute emission for MIS. 
*/ + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L); #ifdef __VOLUME__ - /* Sanitize volume stack. */ - if(!hit) { - kernel_volume_clean_stack(kg, state.volume_stack); - } - /* volume attenuation, emission, scatter */ - if(state.volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = ray; - volume_ray.t = (hit)? isect.t: FLT_MAX; - - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); - - if(decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, &sd, &volume_ray); - kernel_volume_decoupled_record(kg, &state, - &volume_ray, &sd, &volume_segment, heterogeneous); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if(volume_segment.closure_flag & SD_SCATTER) { - int all = false; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, L, all, - &volume_ray, &volume_segment); - - /* indirect sample. 
if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter(kg, - &state, &volume_ray, &sd, &throughput, - rphase, rscatter, &volume_segment, NULL, true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) - continue; - else - break; - } - else { - throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); - -# ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) - continue; - else - break; - } -# endif /* __VOLUME_SCATTER__ */ - } + /* Volume integration. */ + VolumeIntegrateResult result = kernel_path_volume(kg, + &sd, + state, + ray, + &throughput, + &isect, + hit, + emission_sd, + L); + + if(result == VOLUME_PATH_SCATTERED) { + continue; + } + else if(result == VOLUME_PATH_MISSED) { + break; } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__*/ + /* Shade background. 
*/ if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif /* __PASSES__ */ - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(L, &state, throughput, L_background); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, state, ray, throughput, emission_sd, L); break; } - else if(state.bounce > kernel_data.integrator.ao_bounces) { + else if(path_state_ao_bounce(kg, state)) { break; } - /* setup shading */ - shader_setup_from_ray(kg, &sd, &isect, &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); - shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - -#ifdef __SHADOW_TRICKS__ - if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); - state.catcher_object = sd.object; - if(!kernel_data.background.transparent) { - L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); - } - } - } - else { - state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; - } -#endif /* __SHADOW_TRICKS__ */ - - /* holdout */ -#ifdef __HOLDOUT__ - if(((sd.flag & SD_HOLDOUT) || - (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) && - (state.flag & PATH_RAY_CAMERA)) + /* Setup and evaluate shader. */ + shader_setup_from_ray(kg, &sd, &isect, ray); + shader_eval_surface(kg, &sd, state, state->flag); + shader_prepare_closures(&sd, state); + + /* Apply shadow catcher, holdout, emission. 
*/ + if(!kernel_path_shader_apply(kg, + &sd, + state, + ray, + throughput, + emission_sd, + L, + buffer)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - } - else { - holdout_weight = shader_holdout_eval(kg, &sd); - } - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } - - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - break; - } - } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf; - - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, &sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - /* todo: is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state.bounce); + break; } -#endif /* __EMISSION__ */ /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. 
gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; throughput /= probability; } - kernel_update_denoising_features(kg, &sd, &state, L); + kernel_update_denoising_features(kg, &sd, state, L); #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -733,11 +632,10 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_BSSRDF) { if(kernel_path_subsurface_scatter(kg, &sd, - &emission_sd, + emission_sd, L, - &state, - rng, - &ray, + state, + ray, &throughput, &ss_indirect)) { @@ -747,24 +645,22 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); + kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) + if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, L); - /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. 
*/ if(ss_indirect.num_rays) { kernel_path_subsurface_setup_indirect(kg, &ss_indirect, - &state, - &ray, + state, + ray, L, &throughput); } @@ -773,16 +669,6 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, } } #endif /* __SUBSURFACE__ */ - -#ifdef __SHADOW_TRICKS__ - *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); -#endif /* __SHADOW_TRICKS__ */ - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -796,25 +682,37 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, rng_state += index; buffer += index*pass_stride; - /* initialize random numbers and ray */ - RNG rng; + /* Initialize random numbers and sample ray. */ + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); - - /* integrate */ - PathRadiance L; - bool is_shadow_catcher; + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray); - if(ray.t != 0.0f) { - float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); - kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); - } - else { - kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + if(ray.t == 0.0f) { + kernel_write_result(kg, buffer, sample, NULL); + return; } - path_rng_end(kg, rng_state, rng); + /* Initialize state. */ + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + + PathRadiance L; + path_radiance_init(&L, kernel_data.film.use_light_pass); + + ShaderData emission_sd; + PathState state; + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); + + /* Integrate. 
*/ + kernel_path_integrate(kg, + &state, + throughput, + &ray, + &L, + buffer, + &emission_sd); + + kernel_write_result(kg, buffer, sample, &L); } #endif /* __SPLIT_KERNEL__ */ diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 10816d3e5d1..6e0ec22d581 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -23,7 +23,6 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, ccl_addr_space PathState *state, - RNG *rng, float3 throughput) { int num_samples = kernel_data.integrator.ao_samples; @@ -35,7 +34,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, for(int j = 0; j < num_samples; j++) { float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float3 ao_D; float ao_pdf; @@ -49,13 +48,11 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -#endif /* __OBJECT_MOTION__ */ light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { @@ -69,7 +66,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, + ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 
throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { float sum_sample_weight = 0.0f; @@ -113,35 +110,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba num_samples = ceil_to_int(num_samples_adjust*num_samples); float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(*rng, i); for(int j = 0; j < num_samples; j++) { PathState ps = *state; float3 tp = throughput; Ray bsdf_ray; +#ifdef __SHADOW_TRICKS__ + float shadow_transparency = L->shadow_transparency; +#endif + + ps.rng_hash = cmj_hash(state->rng_hash, i); if(!kernel_branched_path_surface_bounce(kg, - &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, - L, + &L->state, &bsdf_ray, sum_sample_weight)) { continue; } + ps.rng_hash = state->rng_hash; + kernel_path_indirect(kg, indirect_sd, emission_sd, - rng, &bsdf_ray, tp*num_samples_inv, - num_samples, &ps, L); @@ -149,6 +149,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba * for the next samples */ path_radiance_sum_indirect(L); path_radiance_reset_indirect(L); + +#ifdef __SHADOW_TRICKS__ + L->shadow_transparency = shadow_transparency; +#endif } } } @@ -160,7 +164,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, PathState *state, - RNG *rng, Ray *ray, float3 throughput) { @@ -171,17 +174,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, continue; /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); + uint lcg_state = lcg_state_init(state, 0x68bc21eb); int num_samples = kernel_data.integrator.subsurface_samples; float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); + uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i); /* do subsurface scatter step with copy of shader data, this will * replace the BSSRDF with a diffuse BSDF closure */ for(int j = 0; j < 
num_samples; j++) { SubsurfaceIntersection ss_isect; float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, @@ -234,7 +237,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light( kg, - rng, &bssrdf_sd, emission_sd, &hit_state, @@ -248,7 +250,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, /* indirect light */ kernel_branched_path_surface_indirect_light( kg, - rng, &bssrdf_sd, indirect_sd, emission_sd, @@ -262,17 +263,15 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer, - PathRadiance *L, - bool *is_shadow_catcher) +ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, + uint rng_hash, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L) { /* initialize */ float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); @@ -282,48 +281,16 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, ShaderData emission_sd, indirect_sd; PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ + path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); /* Main Loop * Here we only handle transparency intersections from the camera ray. * Indirect bounces are handled in kernel_branched_path_surface_indirect_light(). 
*/ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. */ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if(kernel_data.cam.resolution == 1) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d); - } - - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif /* __HAIR__ */ - -#ifdef __KERNEL_DEBUG__ - debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_bvh_intersections += isect.num_intersections; - debug_data.num_ray_bounces++; -#endif /* __KERNEL_DEBUG__ */ + bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L); #ifdef __VOLUME__ /* Sanitize volume stack. */ @@ -353,7 +320,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_volume_connect_light(kg, rng, &sd, + kernel_branched_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); @@ -372,30 +339,25 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* scatter sample. 
if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - - if(kernel_path_volume_bounce(kg, - rng, + if(result == VOLUME_PATH_SCATTERED && + kernel_path_volume_bounce(kg, &sd, &tp, &ps, - L, + &L->state, &pray)) { kernel_path_indirect(kg, &indirect_sd, &emission_sd, - rng, &pray, tp*num_samples_inv, - num_samples, &ps, L); @@ -409,7 +371,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* emission and transmittance */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, &state, throughput, volume_segment.accum_emission); throughput *= volume_segment.accum_transmittance; /* free cached steps */ @@ -431,29 +393,26 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. 
* alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); + kernel_path_volume_connect_light(kg, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, - rng, &sd, &tp, &ps, - L, + &L->state, &pray)) { kernel_path_indirect(kg, &indirect_sd, &emission_sd, - rng, &pray, tp, - num_samples, &ps, L); @@ -472,89 +431,42 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, } #endif /* __VOLUME__ */ + /* Shade background. */ if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif /* __PASSES__ */ - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(L, &state, throughput, L_background); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, &state, &ray, throughput, &emission_sd, L); break; } - /* setup shading */ + /* Setup and evaluate shader. 
*/ shader_setup_from_ray(kg, &sd, &isect, &ray); - shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &sd, &state, state.flag); shader_merge_closures(&sd); -#ifdef __SHADOW_TRICKS__ - if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); - state.catcher_object = sd.object; - if(!kernel_data.background.transparent) { - L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); - } - } - } - else { - state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; - } -#endif /* __SHADOW_TRICKS__ */ - - /* holdout */ -#ifdef __HOLDOUT__ - if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - } - else { - holdout_weight = shader_holdout_eval(kg, &sd); - } - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - break; - } - } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); - -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state.bounce); + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + &sd, + &state, + &ray, + throughput, + &emission_sd, + L, + buffer)) + { + break; } -#endif /* __EMISSION__ */ /* transparency termination */ if(state.flag & PATH_RAY_TRANSPARENT) { /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. 
gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, &state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -568,7 +480,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput); } #endif /* __AO__ */ @@ -576,7 +488,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - L, &state, rng, &ray, throughput); + L, &state, &ray, throughput); } #endif /* __SUBSURFACE__ */ @@ -588,13 +500,13 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, if(kernel_data.integrator.use_direct_light) { int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); - kernel_branched_path_surface_connect_light(kg, rng, + kernel_branched_path_surface_connect_light(kg, &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, rng, + kernel_branched_path_surface_indirect_light(kg, &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ @@ -623,16 +535,6 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, 
kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); #endif /* __VOLUME__ */ } - -#ifdef __SHADOW_TRICKS__ - *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); -#endif /* __SHADOW_TRICKS__ */ - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -647,24 +549,21 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, buffer += index*pass_stride; /* initialize random numbers and ray */ - RNG rng; + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray); /* integrate */ PathRadiance L; - bool is_shadow_catcher; if(ray.t != 0.0f) { - float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); - kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L); + kernel_write_result(kg, buffer, sample, &L); } else { - kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + kernel_write_result(kg, buffer, sample, NULL); } - - path_rng_end(kg, rng_state, rng); } #endif /* __SPLIT_KERNEL__ */ diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h index 82f83deb595..54dd278a185 100644 --- a/intern/cycles/kernel/kernel_path_common.h +++ b/intern/cycles/kernel/kernel_path_common.h @@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, - RNG *rng, + uint *rng_hash, ccl_addr_space Ray *ray) { float filter_u; @@ -34,20 +34,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, *rng_state = hash_int_2d(x, y); } - path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); + 
path_rng_init(kg, rng_state, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v); /* sample camera ray */ float lens_u = 0.0f, lens_v = 0.0f; if(kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); float time = 0.0f; #ifdef __CAMERA_MOTION__ if(kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); + time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME); #endif camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 5d92fd12201..eccee54c0e3 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void path_state_init(KernelGlobals *kg, ShaderData *stack_sd, ccl_addr_space PathState *state, - RNG *rng, + uint rng_hash, int sample, ccl_addr_space Ray *ray) { state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; + state->rng_hash = rng_hash; state->rng_offset = PRNG_BASE_NUM; state->sample = sample; state->num_samples = kernel_data.integrator.aa_samples; + state->branch_factor = 1.0f; state->bounce = 0; state->diffuse_bounce = 0; @@ -58,16 +60,12 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, /* Initialize volume stack with volume we are inside of. 
*/ kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack); /* Seed RNG for cases where we can't use stratified samples .*/ - state->rng_congruential = lcg_init(*rng + sample*0x51633e2d); + state->rng_congruential = lcg_init(rng_hash + sample*0x51633e2d); } else { state->volume_stack[0].shader = SHADER_NONE; } #endif - -#ifdef __SHADOW_TRICKS__ - state->catcher_object = OBJECT_NONE; -#endif } ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label) @@ -78,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta state->flag |= PATH_RAY_TRANSPARENT; state->transparent_bounce++; - /* don't increase random number generator offset here, to avoid some - * unwanted patterns, see path_state_rng_1D_for_decision */ - if(!kernel_data.integrator.transparent_shadows) state->flag |= PATH_RAY_MIS_SKIP; + /* random number generator next bounce */ + state->rng_offset += PRNG_BOUNCE_NUM; + return; } @@ -146,7 +144,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta #endif } -ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) +ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state) { uint flag = state->flag & PATH_RAY_ALL_VISIBILITY; @@ -160,17 +158,28 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s return flag; } -ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) +ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg, + ccl_addr_space PathState *state, + const float3 throughput) { if(state->flag & PATH_RAY_TRANSPARENT) { - /* transparent rays treated separately */ - if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) + /* Transparent rays are treated separately with own max bounces. 
*/ + if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { return 0.0f; - else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) + } + /* Do at least one bounce without RR. */ + else if(state->transparent_bounce <= 1) { return 1.0f; + } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { + return 1.0f; + } +#endif } else { - /* other rays */ + /* Test max bounces for various ray types. */ if((state->bounce >= kernel_data.integrator.max_bounce) || (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) || (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) || @@ -181,13 +190,21 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_ { return 0.0f; } - else if(state->bounce <= kernel_data.integrator.min_bounce) { + /* Do at least one bounce without RR. */ + else if(state->bounce <= 1) { return 1.0f; } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { + return 1.0f; + } +#endif } - /* probalistic termination */ - return average(throughput); /* todo: try using max here */ + /* Probalistic termination: use sqrt() to roughly match typical view + * transform and do path termination a bit later on average. 
*/ + return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f); } /* TODO(DingTo): Find more meaningful name for this */ @@ -200,5 +217,30 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, state->bounce -= 1; } +ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state) +{ + if(state->bounce <= kernel_data.integrator.ao_bounces) { + return false; + } + + int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0); + return (bounce > kernel_data.integrator.ao_bounces); +} + +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, + int branch, + int num_branches) +{ + state->rng_offset += PRNG_BOUNCE_NUM; + + if(num_branches > 1) { + /* Path is splitting into a branch, adjust so that each branch + * still gets a unique sample from the same sequence. */ + state->sample = state->sample*num_branches + branch; + state->num_samples = state->num_samples*num_branches; + state->branch_factor *= num_branches; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h index 10b568ac3dd..1436e8e5a5b 100644 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -28,16 +28,14 @@ bool kernel_path_subsurface_scatter( ShaderData *emission_sd, PathRadiance *L, ccl_addr_space PathState *state, - RNG *rng, ccl_addr_space Ray *ray, ccl_addr_space float3 *throughput, ccl_addr_space SubsurfaceIndirectRays *ss_indirect) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf 
closure */ if(sc) { @@ -45,13 +43,11 @@ bool kernel_path_subsurface_scatter( * the second one should be converted to a diffuse BSDF to * avoid this. */ - kernel_assert(!ss_indirect->tracing); + kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR)); - uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); SubsurfaceIntersection ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, @@ -60,7 +56,7 @@ bool kernel_path_subsurface_scatter( bssrdf_u, bssrdf_v, false); # ifdef __VOLUME__ - ss_indirect->need_update_volume_stack = + bool need_update_volume_stack = kernel_data.integrator.use_volumes && sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; # endif /* __VOLUME__ */ @@ -79,29 +75,25 @@ bool kernel_path_subsurface_scatter( sc, false); + kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L); + ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; - PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays]; + PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays]; *hit_state = *state; *hit_ray = *ray; *hit_tp = *throughput; + *hit_L_state = L->state; hit_state->rng_offset += PRNG_BOUNCE_NUM; - path_radiance_init(hit_L, kernel_data.film.use_light_pass); - hit_L->direct_throughput = L->direct_throughput; - path_radiance_copy_indirect(hit_L, L); - - kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); - if(kernel_path_surface_bounce(kg, - rng, sd, hit_tp, hit_state, - hit_L, + hit_L_state, hit_ray)) { # ifdef __LAMP_MIS__ @@ -109,7 +101,7 @@ bool kernel_path_subsurface_scatter( # endif /* 
__LAMP_MIS__ */ # ifdef __VOLUME__ - if(ss_indirect->need_update_volume_stack) { + if(need_update_volume_stack) { Ray volume_ray = *ray; /* Setup ray from previous surface point to the new one. */ volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, @@ -122,12 +114,8 @@ bool kernel_path_subsurface_scatter( hit_state->volume_stack); } # endif /* __VOLUME__ */ - path_radiance_reset_indirect(L); ss_indirect->num_rays++; } - else { - path_radiance_accum_sample(L, hit_L, 1); - } } return true; } @@ -137,23 +125,9 @@ bool kernel_path_subsurface_scatter( ccl_device_inline void kernel_path_subsurface_init_indirect( ccl_addr_space SubsurfaceIndirectRays *ss_indirect) { - ss_indirect->tracing = false; ss_indirect->num_rays = 0; } -ccl_device void kernel_path_subsurface_accum_indirect( - ccl_addr_space SubsurfaceIndirectRays *ss_indirect, - PathRadiance *L) -{ - if(ss_indirect->tracing) { - path_radiance_sum_indirect(L); - path_radiance_accum_sample(&ss_indirect->direct_L, L, 1); - if(ss_indirect->num_rays == 0) { - *L = ss_indirect->direct_L; - } - } -} - ccl_device void kernel_path_subsurface_setup_indirect( KernelGlobals *kg, ccl_addr_space SubsurfaceIndirectRays *ss_indirect, @@ -162,20 +136,15 @@ ccl_device void kernel_path_subsurface_setup_indirect( PathRadiance *L, ccl_addr_space float3 *throughput) { - if(!ss_indirect->tracing) { - ss_indirect->direct_L = *L; - } - ss_indirect->tracing = true; - /* Setup state, ray and throughput for indirect SSS rays. 
*/ ss_indirect->num_rays--; - ccl_addr_space Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays]; - PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays]; + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); *state = ss_indirect->state[ss_indirect->num_rays]; - *ray = *indirect_ray; - *L = *indirect_L; + *ray = ss_indirect->rays[ss_indirect->num_rays]; + L->state = ss_indirect->L_state[ss_indirect->num_rays]; *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index dcb577e176f..7b566b01b04 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN /* branched path tracing: connect path directly to position on one or more lights and add it to L */ ccl_device_noinline void kernel_branched_path_surface_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, ccl_addr_space PathState *state, @@ -50,12 +49,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples); LightSample ls; if(lamp_light_sample(kg, i, light_u, light_v, 
sd->P, &ls)) { @@ -68,7 +67,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -86,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( float num_samples_inv = num_samples_adjust/num_samples; for(int j = 0; j < num_samples; j++) { - float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; + light_u = 0.5f*light_u; LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. 
*/ if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; @@ -105,7 +103,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -119,19 +117,18 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( } else { /* sample one light at random */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } @@ -147,14 +144,13 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( /* branched path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_branched_path_surface_bounce( KernelGlobals *kg, - RNG *rng, ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, - PathRadiance 
*L, + PathRadianceState *L_state, ccl_addr_space Ray *ray, float sum_sample_weight) { @@ -164,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce( float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, @@ -174,7 +170,7 @@ ccl_device bool kernel_branched_path_surface_bounce( return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); #ifdef __DENOISING_FEATURES__ state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); @@ -217,7 +213,7 @@ ccl_device bool kernel_branched_path_surface_bounce( #endif /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { @@ -228,7 +224,6 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG #ifdef __SHADOW_TRICKS__ if(state->flag & PATH_RAY_SHADOW_CATCHER) { kernel_branched_path_surface_connect_light(kg, - rng, sd, emission_sd, state, @@ -241,9 +236,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG #endif /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, 
&light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -254,13 +248,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG #endif LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } @@ -274,11 +268,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG /* path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, - PathRadiance *L, + PathRadianceState *L_state, ccl_addr_space Ray *ray) { /* no BSDF? 
we can stop here */ @@ -289,7 +282,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, @@ -299,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); /* set labels */ if(!(label & LABEL_TRANSPARENT)) { diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index dcedf51e479..b6a856baf24 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_path_volume_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, @@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light( return; /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -42,18 +40,16 @@ ccl_device_inline void kernel_path_volume_connect_light( bool is_lamp; /* connect to light from given point where shader has been evaluated */ -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = 
path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } @@ -69,11 +65,10 @@ ccl_device #endif bool kernel_path_volume_bounce( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, - PathRadiance *L, + PathRadianceState *L_state, ccl_addr_space Ray *ray) { /* sample phase function */ @@ -82,7 +77,7 @@ bool kernel_path_volume_bounce( float3 phase_omega_in; differential3 phase_domega_in; float phase_u, phase_v; - path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v); int label; label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, @@ -92,7 +87,7 @@ bool kernel_path_volume_bounce( return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label); /* set labels */ state->ray_pdf = phase_pdf; @@ -120,7 +115,6 @@ bool kernel_path_volume_bounce( #ifndef __SPLIT_KERNEL__ ccl_device void kernel_branched_path_volume_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, @@ -138,9 +132,7 @@ ccl_device void kernel_branched_path_volume_connect_light( BsdfEval L_light; bool is_lamp; -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif if(sample_all_lights) { /* lamp sampling */ @@ -150,12 +142,12 @@ ccl_device void kernel_branched_path_volume_connect_light( int num_samples = 
light_select_num_samples(kg, i); float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { /* sample random position on given light */ float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls); @@ -163,26 +155,24 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? 
&ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { if(kernel_data.integrator.pdf_triangles != 0.0f) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -198,40 +188,37 @@ ccl_device void kernel_branched_path_volume_connect_light( for(int j = 0; j < num_samples; j++) { /* sample random position on random triangle */ - float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; + light_u = 0.5f*light_u; LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); + light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls); float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, 
state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } @@ -242,34 +229,31 @@ ccl_device void kernel_branched_path_volume_connect_light( } else { /* sample random position on random light */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); + light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls); float3 tp = throughput; /* sample position 
on volume segment */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index e8a912ccc0b..11798d87cb5 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -18,55 +18,18 @@ CCL_NAMESPACE_BEGIN -#ifdef __SOBOL__ - -/* Skip initial numbers that are not as well distributed, especially the - * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination. - */ -#define SOBOL_SKIP 64 +/* Pseudo random numbers, uncomment this for debugging correlations. Only run + * this single threaded on a CPU for repeatable resutls. 
*/ +//#define __DEBUG_CORRELATION__ -/* High Dimensional Sobol. */ -/* Van der Corput radical inverse. */ -ccl_device uint van_der_corput(uint bits) -{ - bits = (bits << 16) | (bits >> 16); - bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8); - bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4); - bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2); - bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1); - return bits; -} - -/* Sobol radical inverse. */ -ccl_device uint sobol(uint i) -{ - uint r = 0; - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) { - if(i & 1) { - r ^= v; - } - } - return r; -} +/* High Dimensional Sobol. + * + * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal + * to classic Van der Corput and Sobol sequences. */ -/* Inverse of sobol radical inverse. */ -ccl_device uint sobol_inverse(uint i) -{ - const uint msb = 1U << 31; - uint r = 0; - for(uint v = 1; i; i <<= 1, v ^= v << 1) { - if(i & msb) { - r ^= v; - } - } - return r; -} +#ifdef __SOBOL__ -/* Multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively. - */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; @@ -79,51 +42,32 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) return result; } -/* Lookup index and x/y coordinate, assumes m is a power of two. */ -ccl_device uint sobol_lookup(const uint m, - const uint frame, - const uint ex, - const uint ey, - uint *x, uint *y) -{ - /* Shift is constant per frame. */ - const uint shift = frame << (m << 1); - const uint sobol_shift = sobol(shift); - /* Van der Corput is its own inverse. */ - const uint lower = van_der_corput(ex << (32 - m)); - /* Need to compensate for ey difference and shift. */ - const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. 
*/ - const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* Only use m upper bits for the index (m is a power of two). */ - const uint sobol_result = delta | (delta >> m); - const uint upper = sobol_inverse(sobol_result); - const uint index = shift | upper | lower; - *x = van_der_corput(index); - *y = sobol_shift ^ sobol_result ^ sobol_lower; - return index; -} +#endif /* __SOBOL__ */ + ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, - RNG *rng, + uint rng_hash, int sample, int num_samples, int dimension) { +#ifdef __DEBUG_CORRELATION__ + return (float)drand48(); +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { /* Correlated multi-jitter. */ - int p = *rng + dimension; + int p = rng_hash + dimension; return cmj_sample_1D(sample, num_samples, p); } #endif -#ifdef __SOBOL_FULL_SCREEN__ - uint result = sobol_dimension(kg, *rng, dimension); - float r = (float)result * (1.0f/(float)0xFFFFFFFF); - return r; -#else - /* Compute sobol sequence value using direction vectors. */ - uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); +#ifdef __SOBOL__ + /* Sobol sequence value using direction vectors. */ + uint result = sobol_dimension(kg, sample, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); /* Cranly-Patterson rotation using rng seed */ @@ -132,7 +76,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, /* Hash rng with dimension to solve correlation issues. * See T38710, T50116. 
*/ - RNG tmp_rng = cmj_hash_simple(dimension, *rng); + uint tmp_rng = cmj_hash_simple(dimension, rng_hash); shift = tmp_rng * (1.0f/(float)0xFFFFFFFF); return r + shift - floorf(r + shift); @@ -140,128 +84,60 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, } ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, - RNG *rng, + uint rng_hash, int sample, int num_samples, int dimension, float *fx, float *fy) { +#ifdef __DEBUG_CORRELATION__ + *fx = (float)drand48(); + *fy = (float)drand48(); + return; +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { /* Correlated multi-jitter. */ - int p = *rng + dimension; + int p = rng_hash + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); + return; } - else #endif - { - /* Sobol. */ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); - } + +#ifdef __SOBOL__ + /* Sobol. 
*/ + *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension); + *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1); +#endif } ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, - RNG *rng, + uint *rng_hash, int x, int y, float *fx, float *fy) { -#ifdef __SOBOL_FULL_SCREEN__ - uint px, py; - uint bits = 16; /* limits us to 65536x65536 and 65536 samples */ - uint size = 1 << bits; - uint frame = sample; - - *rng = sobol_lookup(bits, frame, x, y, &px, &py); - - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - *fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x; - *fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y; - } -#else - *rng = *rng_state; - - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); - } -#endif -} - -ccl_device void path_rng_end(KernelGlobals *kg, - ccl_global uint *rng_state, - RNG rng) -{ - /* nothing to do */ -} - -#else /* __SOBOL__ */ - -/* Linear Congruential Generator */ - -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, - RNG *rng, - int sample, int num_samples, - int dimension) -{ - /* implicit mod 2^32 */ - *rng = (1103515245*(*rng) + 12345); - return (float)*rng * (1.0f/(float)0xFFFFFFFF); -} - -ccl_device_inline void path_rng_2D(KernelGlobals *kg, - RNG *rng, - int sample, int num_samples, - int dimension, - float *fx, float *fy) -{ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); -} - -ccl_device void path_rng_init(KernelGlobals *kg, - ccl_global uint *rng_state, - int sample, int num_samples, - RNG *rng, - int x, int y, - float *fx, float *fy) -{ /* load state */ - *rng = *rng_state; + *rng_hash = *rng_state; + *rng_hash ^= kernel_data.integrator.seed; - *rng ^= kernel_data.integrator.seed; 
+#ifdef __DEBUG_CORRELATION__ + srand48(*rng_hash + sample); +#endif if(sample == 0) { *fx = 0.5f; *fy = 0.5f; } else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy); } } -ccl_device void path_rng_end(KernelGlobals *kg, - ccl_global uint *rng_state, - RNG rng) -{ - /* store state for next sample */ - *rng_state = rng; -} - -#endif /* __SOBOL__ */ - /* Linear Congruential Generator */ ccl_device uint lcg_step_uint(uint *rng) @@ -295,44 +171,22 @@ ccl_device uint lcg_init(uint seed) */ ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state, int dimension) { return path_rng_1D(kg, - rng, + state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision( - KernelGlobals *kg, - RNG *rng, - const ccl_addr_space PathState *state, - int dimension) -{ - /* The rng_offset is not increased for transparent bounces. if we do then - * fully transparent objects can become subtly visible by the different - * sampling patterns used where the transparent object is. - * - * however for some random numbers that will determine if we next bounce - * is transparent we do need to increase the offset to avoid always making - * the same decision. 
*/ - const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; - return path_rng_1D(kg, - rng, - state->sample, state->num_samples, - rng_offset + dimension); -} - ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, - rng, + state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); @@ -340,38 +194,22 @@ ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_device_inline float path_branched_rng_1D( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, - rng, + rng_hash, state->sample * num_branches + branch, state->num_samples * num_branches, state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision( - KernelGlobals *kg, - RNG *rng, - const ccl_addr_space PathState *state, - int branch, - int num_branches, - int dimension) -{ - const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; - return path_rng_1D(kg, - rng, - state->sample * num_branches + branch, - state->num_samples * num_branches, - rng_offset + dimension); -} - ccl_device_inline void path_branched_rng_2D( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches, @@ -379,7 +217,7 @@ ccl_device_inline void path_branched_rng_2D( float *fx, float *fy) { path_rng_2D(kg, - rng, + rng_hash, state->sample * num_branches + branch, state->num_samples * num_branches, state->rng_offset + dimension, @@ -391,52 +229,45 @@ ccl_device_inline void path_branched_rng_2D( */ ccl_device_inline float path_state_rng_light_termination( KernelGlobals *kg, - RNG *rng, const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D_for_decision(kg, rng, state, 
PRNG_LIGHT_TERMINATE); + return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); } return 0.0f; } ccl_device_inline float path_branched_rng_light_termination( KernelGlobals *kg, - RNG *rng, + uint rng_hash, const ccl_addr_space PathState *state, int branch, int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, - rng, - state, - branch, - num_branches, - PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D(kg, + rng_hash, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, - int branch, - int num_branches) +ccl_device_inline uint lcg_state_init(PathState *state, + uint scramble) { - /* path is splitting into a branch, adjust so that each branch - * still gets a unique sample from the same sequence */ - state->rng_offset += PRNG_BOUNCE_NUM; - state->sample = state->sample*num_branches + branch; - state->num_samples = state->num_samples*num_branches; + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } -ccl_device_inline uint lcg_state_init(RNG *rng, - int rng_offset, - int sample, - uint scramble) +ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, + uint scramble) { - return lcg_init(*rng + rng_offset + sample*scramble); + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } + ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { /* Implicit mod 2^32 */ diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index c66f52255f0..eeb4eb0097f 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -66,8 +66,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - sd->time = ray->time; #endif + sd->time = 
ray->time; sd->prim = kernel_tex_fetch(__prim_index, isect->prim); sd->ray_length = isect->t; @@ -83,7 +83,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, float4 curvedata = kernel_tex_fetch(__curves, sd->prim); sd->shader = __float_as_int(curvedata.z); - sd->P = bvh_curve_refine(kg, sd, isect, ray); + sd->P = curve_refine(kg, sd, isect, ray); } else #endif @@ -271,17 +271,17 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, sd->u = u; sd->v = v; #endif + sd->time = time; sd->ray_length = t; sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); sd->object_flag = 0; if(sd->object != OBJECT_NONE) { sd->object_flag |= kernel_tex_fetch(__object_flag, - sd->object); + sd->object); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); - sd->time = time; } else if(lamp != LAMP_NONE) { sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); @@ -385,9 +385,7 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat sd->shader = kernel_data.background.surface_shader; sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); sd->object_flag = 0; -#ifdef __OBJECT_MOTION__ sd->time = ray->time; -#endif sd->ray_length = 0.0f; #ifdef __INSTANCING__ @@ -427,9 +425,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s sd->shader = SHADER_NONE; sd->flag = 0; sd->object_flag = 0; -#ifdef __OBJECT_MOTION__ sd->time = ray->time; -#endif sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */ #ifdef __INSTANCING__ @@ -498,20 +494,45 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd) } #endif +/* Defensive sampling. */ + +ccl_device_inline void shader_prepare_closures(ShaderData *sd, + ccl_addr_space PathState *state) +{ + /* We can likely also do defensive sampling at deeper bounces, particularly + * for cases like a perfect mirror but possibly also others. 
This will need + * a good heuristic. */ + if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) { + float sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sum += sc->sample_weight; + } + } + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sc->sample_weight = max(sc->sample_weight, 0.125f * sum); + } + } + } +} + + /* BSDF */ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf, - int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) + const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ for(int i = 0; i < sd->num_closure; i++) { - if(i == skip_bsdf) - continue; - const ShaderClosure *sc = &sd->closure[i]; - if(CLOSURE_IS_BSDF(sc->type)) { + if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -574,7 +595,7 @@ void shader_bsdf_eval(KernelGlobals *kg, #endif { float pdf; - _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f); + _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f); if(use_mis) { float weight = power_heuristic(light_pdf, pdf); bsdf_eval_mis(eval, weight); @@ -582,48 +603,120 @@ void shader_bsdf_eval(KernelGlobals *kg, } } -ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - float randu, float randv, - BsdfEval *bsdf_eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) +ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, + float *randu) { int sampled = 0; if(sd->num_closure > 1) { - /* pick a BSDF closure based on sample weights 
*/ + /* Pick a BSDF or based on sample weights. */ float sum = 0.0f; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; - - if(CLOSURE_IS_BSDF(sc->type)) + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; + } } - float r = sd->randb_closure*sum; - sum = 0.0f; + float r = (*randu)*sum; + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; - if(CLOSURE_IS_BSDF(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + sampled = i; - if(r <= sum) + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } + } - if(sampled == sd->num_closure) { - *pdf = 0.0f; - return LABEL_NONE; + return &sd->closure[sampled]; +} + +ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, + ccl_addr_space float3 *throughput, + float *randu) +{ + int sampled = 0; + + if(sd->num_closure > 1) { + /* Pick a BSDF or BSSRDF or based on sample weights. 
*/ + float sum_bsdf = 0.0f; + float sum_bssrdf = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { + sum_bsdf += sc->sample_weight; + } + else if(CLOSURE_IS_BSSRDF(sc->type)) { + sum_bssrdf += sc->sample_weight; + } + } + + float r = (*randu)*(sum_bsdf + sum_bssrdf); + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + if(CLOSURE_IS_BSDF(sc->type)) { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf; + return NULL; + } + else { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf; + sampled = i; + + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; + break; + } + } + + partial_sum = next_sum; + } } } - const ShaderClosure *sc = &sd->closure[sampled]; + return &sd->closure[sampled]; +} + +ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + float randu, float randv, + BsdfEval *bsdf_eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) +{ + const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); + if(sc == NULL) { + *pdf = 0.0f; + return LABEL_NONE; + } + + /* BSSRDF should already have been handled elsewhere. 
*/ + kernel_assert(CLOSURE_IS_BSDF(sc->type)); int label; float3 eval; @@ -636,7 +729,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(sd->num_closure > 1) { float sweight = sc->sample_weight; - _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); + _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight); } } @@ -669,7 +762,7 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn } } -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd) { if(sd->flag & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); @@ -677,7 +770,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) float3 eval = make_float3(0.0f, 0.0f, 0.0f); for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -764,6 +857,19 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) +{ + float3 N = make_float3(0.0f, 0.0f, 0.0f); + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + N += sc->N*average(sc->weight); + } + + return (is_zero(N))? sd->N : normalize(N); +} + ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -783,12 +889,7 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } } - if(is_zero(N)) - N = sd->N; - else - N = normalize(N); - - *N_ = N; + *N_ = (is_zero(N))? 
sd->N : normalize(N); return eval; } @@ -863,16 +964,15 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng, - ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) +ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, + ccl_addr_space PathState *state, int path_flag) { sd->num_closure = 0; sd->num_closure_extra = 0; - sd->randb_closure = randb; #ifdef __OSL__ if(kg->osl) - OSLShader::eval_surface(kg, sd, state, path_flag, ctx); + OSLShader::eval_surface(kg, sd, state, path_flag); else #endif { @@ -887,24 +987,23 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng, #endif } - if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) { - sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953); + if(sd->flag & SD_BSDF_NEEDS_LCG) { + sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953); } } /* Background Evaluation */ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, - ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) + ccl_addr_space PathState *state, int path_flag) { sd->num_closure = 0; sd->num_closure_extra = 0; - sd->randb_closure = 0.0f; #ifdef __SVM__ #ifdef __OSL__ if(kg->osl) { - OSLShader::eval_background(kg, sd, state, path_flag, ctx); + OSLShader::eval_background(kg, sd, state, path_flag); } else #endif @@ -981,17 +1080,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s sum += sc->sample_weight; } - float r = sd->randb_closure*sum; - sum = 0.0f; + float r = randu*sum; + float partial_sum = 0.0f; for(sampled = 0; sampled < sd->num_closure; sampled++) { const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_PHASE(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; - if(r <= sum) + if(r <= next_sum) { + /* Rescale 
to reuse for BSDF direction sample. */ + randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } @@ -1039,8 +1143,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ccl_addr_space VolumeStack *stack, - int path_flag, - ShaderContext ctx) + int path_flag) { /* reset closures once at the start, we will be accumulating the closures * for all volumes in the stack into a single array of closures */ @@ -1073,7 +1176,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) { - OSLShader::eval_volume(kg, sd, state, path_flag, ctx); + OSLShader::eval_volume(kg, sd, state, path_flag); } else # endif @@ -1092,17 +1195,16 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, /* Displacement Evaluation */ -ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) +ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state) { sd->num_closure = 0; sd->num_closure_extra = 0; - sd->randb_closure = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) - OSLShader::eval_displacement(kg, sd, ctx); + OSLShader::eval_displacement(kg, sd); else # endif { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index fab5946970d..8a0da6c3b13 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -16,6 +16,42 @@ CCL_NAMESPACE_BEGIN +#ifdef __VOLUME__ +typedef struct VolumeState { +# ifdef __SPLIT_KERNEL__ +# else + PathState ps; +# endif +} VolumeState; + +/* Get PathState ready for use for volume stack evaluation. 
*/ +# ifdef __SPLIT_KERNEL__ +ccl_addr_space +# endif +ccl_device_inline PathState *shadow_blocked_volume_path_state( + KernelGlobals *kg, + VolumeState *volume_state, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray) +{ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space PathState *ps = + &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; +# else + PathState *ps = &volume_state->ps; +# endif + *ps = *state; + /* We are checking for shadow on the "other" side of the surface, so need + * to discard volume we are currently at. + */ + if(dot(sd->Ng, ray->D) < 0.0f) { + kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack); + } + return ps; +} +#endif /* __VOLUME__ */ + /* Attenuate throughput accordingly to the given intersection event. * Returns true if the throughput is zero and traversal can be aborted. */ @@ -49,11 +85,8 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( path_state_modify_bounce(state, true); shader_eval_surface(kg, shadow_sd, - NULL, state, - 0.0f, - PATH_RAY_SHADOW, - SHADER_CONTEXT_SHADOW); + PATH_RAY_SHADOW); path_state_modify_bounce(state, false); *throughput *= shader_bsdf_transparency(kg, shadow_sd); } @@ -72,13 +105,14 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, Intersection *isect, float3 *shadow) { const bool blocked = scene_intersect(kg, *ray, - PATH_RAY_SHADOW_OPAQUE, + visibility & PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); @@ -126,9 +160,10 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, * Note that hits array should be as big as max_hits+1. 
*/ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, Intersection *hits, uint max_hits, @@ -141,9 +176,12 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, const bool blocked = scene_intersect_shadow_all(kg, ray, hits, - skip_object, + visibility, max_hits, &num_hits); +# ifdef __VOLUME__ + VolumeState volume_state; +# endif /* If no opaque surface found but we did find transparent hits, * shade them. */ @@ -155,12 +193,13 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, Intersection *isect = hits; # ifdef __VOLUME__ # ifdef __SPLIT_KERNEL__ - ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; -# else - PathState ps_object; - PathState *ps = &ps_object; + ccl_addr_space # endif - *ps = *state; + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); # endif sort_intersections(hits, num_hits); for(int hit = 0; hit < num_hits; hit++, isect++) { @@ -205,8 +244,16 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, } # ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader/ */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + /* Apply attenuation from current volume shader. */ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } # endif return blocked; @@ -216,9 +263,10 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, * loop to help readability of the actual logic. 
*/ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, uint max_hits, float3 *shadow) @@ -251,9 +299,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, # endif /* __KERNEL_GPU__ */ /* Invoke actual traversal. */ return shadow_blocked_transparent_all_loop(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, hits, max_hits, @@ -276,27 +325,32 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, */ ccl_device bool shadow_blocked_transparent_stepped_loop( KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, Intersection *isect, const bool blocked, const bool is_transparent_isect, float3 *shadow) { - if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) { +# ifdef __VOLUME__ + VolumeState volume_state; +# endif + if(blocked && is_transparent_isect) { float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float3 Pend = ray->P + ray->D*ray->t; int bounce = state->transparent_bounce; # ifdef __VOLUME__ # ifdef __SPLIT_KERNEL__ - ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; -# else - PathState ps_object; - PathState *ps = &ps_object; + ccl_addr_space # endif - *ps = *state; + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); # endif for(;;) { if(bounce >= kernel_data.integrator.transparent_max_bounce) { @@ -304,30 +358,13 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( } if(!scene_intersect(kg, *ray, - PATH_RAY_SHADOW_TRANSPARENT, + visibility & PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) { break; } -#ifdef __SHADOW_TRICKS__ - if(skip_object != OBJECT_NONE) { - const int isect_object = (isect->object == PRIM_NONE) - ? 
kernel_tex_fetch(__prim_object, isect->prim) - : isect->object; - if(isect_object == skip_object) { - shader_setup_from_ray(kg, shadow_sd, isect, ray); - /* Move ray forward. */ - ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); - if(ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - bounce++; - continue; - } - } -#endif if(!shader_transparent_shadow(kg, isect)) { return true; } @@ -363,7 +400,15 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( # ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { /* Apply attenuation from current volume shader. */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } # endif return blocked; @@ -371,33 +416,28 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( ccl_device bool shadow_blocked_transparent_stepped( KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - const int skip_object, + const uint visibility, Ray *ray, Intersection *isect, float3 *shadow) { - bool blocked, is_transparent_isect; - if(skip_object == OBJECT_NONE) { - blocked = scene_intersect(kg, - *ray, - PATH_RAY_SHADOW_OPAQUE, - isect, - NULL, - 0.0f, 0.0f); - is_transparent_isect = blocked - ? shader_transparent_shadow(kg, isect) - : false; - } - else { - blocked = false; - is_transparent_isect = false; - } + bool blocked = scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); + bool is_transparent_isect = blocked + ? 
shader_transparent_shadow(kg, isect) + : false; return shadow_blocked_transparent_stepped_loop(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, isect, blocked, @@ -409,6 +449,7 @@ ccl_device bool shadow_blocked_transparent_stepped( #endif /* __TRANSPARENT_SHADOWS__ */ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, Ray *ray_input, @@ -422,25 +463,24 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, return false; } #ifdef __SHADOW_TRICKS__ - const int skip_object = state->catcher_object; + const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) + ? PATH_RAY_SHADOW_NON_CATCHER + : PATH_RAY_SHADOW; #else - const int skip_object = OBJECT_NONE; + const uint visibility = PATH_RAY_SHADOW; #endif /* Do actual shadow shading. */ /* First of all, we check if integrator requires transparent shadows. * if not, we use simplest and fastest ever way to calculate occlusion. - * - * NOTE: We can't do quick opaque test here if we are on shadow-catcher - * path because we don't want catcher object to be casting shadow here. 
*/ #ifdef __TRANSPARENT_SHADOWS__ - if(!kernel_data.integrator.transparent_shadows && - skip_object == OBJECT_NONE) + if(!kernel_data.integrator.transparent_shadows) #endif { return shadow_blocked_opaque(kg, shadow_sd, state, + visibility, ray, &isect, shadow); @@ -467,7 +507,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, */ const bool blocked = scene_intersect(kg, *ray, - PATH_RAY_SHADOW_OPAQUE, + visibility & PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); @@ -478,9 +518,10 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, max_hits + 1 >= SHADOW_STACK_MAX_HITS) { return shadow_blocked_transparent_stepped_loop(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, &isect, blocked, @@ -489,18 +530,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, } # endif /* __KERNEL_GPU__ */ return shadow_blocked_transparent_all(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, max_hits, shadow); # else /* __SHADOW_RECORD_ALL__ */ /* Fallback to a slowest version which works on all devices. */ return shadow_blocked_transparent_stepped(kg, + sd, shadow_sd, state, - skip_object, + visibility, ray, &isect, shadow); diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 6475d4b66fd..23a09e5e2ca 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN * - try to reduce one sample model variance */ -#define BSSRDF_MULTI_EVAL - -ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability) -{ - /* sum sample weights of bssrdf and bsdf */ - float bsdf_sum = 0.0f; - float bssrdf_sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSDF(sc->type)) - bsdf_sum += sc->sample_weight; - else if(CLOSURE_IS_BSSRDF(sc->type)) - bssrdf_sum += sc->sample_weight; - } - - /* use bsdf or bssrdf? 
*/ - float r = sd->randb_closure*(bsdf_sum + bssrdf_sum); - - if(r < bsdf_sum) { - /* use bsdf, and adjust randb so we can reuse it for picking a bsdf */ - sd->randb_closure = r/bsdf_sum; - *probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f; - return NULL; - } - - /* use bssrdf */ - r -= bsdf_sum; - - float sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - sum += sc->sample_weight; - - if(r <= sum) { - sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight; - -#ifdef BSSRDF_MULTI_EVAL - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f; -#else - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f; -#endif - return sc; - } - } - } - - /* should never happen */ - sd->randb_closure = 0.0f; - *probability = 1.0f; - return NULL; -} - ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, float disk_r, float r, bool all) { -#ifdef BSSRDF_MULTI_EVAL /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f); float pdf_sum = 0.0f; - float sample_weight_sum = 0.0f; - int num_bssrdf = 0; + float sample_weight_inv = 0.0f; - for(int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - float sample_weight = (all)? 
1.0f: sc->sample_weight; - sample_weight_sum += sample_weight; + if(!all) { + float sample_weight_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + sc = &sd->closure[i]; + + if(CLOSURE_IS_BSSRDF(sc->type)) { + sample_weight_sum += sc->sample_weight; + } } - } - float sample_weight_inv = 1.0f/sample_weight_sum; + sample_weight_inv = 1.0f/sample_weight_sum; + } for(int i = 0; i < sd->num_closure; i++) { sc = &sd->closure[i]; @@ -125,25 +69,16 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, /* TODO power heuristic is not working correct here */ eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf; pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf; - - num_bssrdf++; } } return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f); -#else - float pdf = bssrdf_pdf(pick_sc, r); - float disk_pdf = bssrdf_pdf(pick_sc, disk_r); - - return pick_sc->weight * pdf / disk_pdf; -#endif } /* replace closures with a single diffuse bsdf closure after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; - sd->randb_closure = 0.0f; sd->num_closure = 0; sd->num_closure_extra = 0; @@ -219,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, if(bump || texture_blur > 0.0f) { /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS); + shader_eval_surface(kg, sd, state, state_flag); float3 in_color = shader_bssrdf_sum(sd, (bump)? 
N: NULL, NULL); /* we simply divide out the average color and multiply with the average @@ -242,8 +177,8 @@ ccl_device_inline int subsurface_scatter_multi_intersect( KernelGlobals *kg, SubsurfaceIntersection *ss_isect, ShaderData *sd, - ShaderClosure *sc, - RNG *lcg_state, + const ShaderClosure *sc, + uint *lcg_state, float disk_u, float disk_v, bool all) @@ -255,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - /* reusing variable for picking the closure gives a bit nicer stratification - * for path tracer, for branched we do all closures so it doesn't help */ - float axisu = (all)? disk_u: sd->randb_closure; - - if(axisu < 0.5f) { + if(disk_u < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; - if(all) - disk_u *= 2.0f; + disk_u *= 2.0f; } - else if(axisu < 0.75f) { + else if(disk_u < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; - if(all) - disk_u = (disk_u - 0.5f)*4.0f; + disk_u = (disk_u - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -283,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; - if(all) - disk_u = (disk_u - 0.75f)*4.0f; + disk_u = (disk_u - 0.75f)*4.0f; } /* sample point on disk */ @@ -390,7 +318,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup( ShaderData *sd, ccl_addr_space PathState *state, int state_flag, - ShaderClosure *sc, + const ShaderClosure *sc, bool all) { #ifdef __SPLIT_KERNEL__ @@ -419,7 +347,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup( /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, - int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) + int state_flag, 
const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -430,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - if(sd->randb_closure < 0.5f) { + if(disk_u < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; + disk_u *= 2.0f; } - else if(sd->randb_closure < 0.75f) { + else if(disk_u < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; + disk_u = (disk_u - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -450,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; + disk_u = (disk_u - 0.75f)*4.0f; } /* sample point on disk */ diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index cb1a3f40dee..5eab28a2953 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -82,115 +82,110 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions) # if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) -KERNEL_IMAGE_TEX(float4, 
texture_image3d_float4, __tex_image_float4_3d_001) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) - -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_008) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_016) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_024) +KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_032) + +/* image + * These texture names are encoded to their flattened slots as + * ImageManager::type_index_to_flattened_slot() returns them. 
*/ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) # else /* bindless textures */ KERNEL_TEX(uint, texture_uint, __bindless_mapping) -# endif -#endif - -/* packed image (opencl) */ -KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed) -KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed) -KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed) -KERNEL_TEX(float, texture_float, __tex_image_float_packed) -KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info) +# endif /* __CUDA_ARCH__ */ +#endif /* __KERNEL_CUDA__ */ #undef KERNEL_TEX #undef KERNEL_IMAGE_TEX diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 34affab1b9d..6c5b6ca3b2d 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -130,6 +130,7 @@ CCL_NAMESPACE_BEGIN # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ # define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because 
megakernel in device_opencl does not support @@ -154,6 +155,7 @@ CCL_NAMESPACE_BEGIN # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ # define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ @@ -240,10 +242,6 @@ CCL_NAMESPACE_BEGIN # undef __DENOISING_FEATURES__ #endif -/* Random Numbers */ - -typedef uint RNG; - /* Shader Evaluation */ typedef enum ShaderEvalType { @@ -283,31 +281,21 @@ enum PathTraceDimension { PRNG_FILTER_V = 1, PRNG_LENS_U = 2, PRNG_LENS_V = 3, -#ifdef __CAMERA_MOTION__ PRNG_TIME = 4, PRNG_UNUSED_0 = 5, PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */ PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */ -#endif - PRNG_BASE_NUM = 8, + PRNG_BASE_NUM = 10, PRNG_BSDF_U = 0, PRNG_BSDF_V = 1, - PRNG_BSDF = 2, - PRNG_LIGHT = 3, - PRNG_LIGHT_U = 4, - PRNG_LIGHT_V = 5, - PRNG_LIGHT_TERMINATE = 6, - PRNG_TERMINATE = 7, - -#ifdef __VOLUME__ - PRNG_PHASE_U = 8, - PRNG_PHASE_V = 9, - PRNG_PHASE = 10, - PRNG_SCATTER_DISTANCE = 11, -#endif - - PRNG_BOUNCE_NUM = 12, + PRNG_LIGHT_U = 2, + PRNG_LIGHT_V = 3, + PRNG_LIGHT_TERMINATE = 4, + PRNG_TERMINATE = 5, + PRNG_PHASE_CHANNEL = 6, + PRNG_SCATTER_DISTANCE = 7, + PRNG_BOUNCE_NUM = 8, }; enum SamplingPattern { @@ -328,24 +316,28 @@ enum PathRayFlag { PATH_RAY_SINGULAR = (1 << 5), PATH_RAY_TRANSPARENT = (1 << 6), - PATH_RAY_SHADOW_OPAQUE = (1 << 7), - PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), - PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), + PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7), + PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8), + PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER), + PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9), + PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10), + PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER), + PATH_RAY_SHADOW_NON_CATCHER = 
(PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER), + PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ + PATH_RAY_CURVE = (1 << 11), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 12), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = (1 << 11), + PATH_RAY_NODE_UNALIGNED = (1 << 13), - PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), + PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1), - PATH_RAY_MIS_SKIP = (1 << 12), - PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), - PATH_RAY_SINGLE_PASS_DONE = (1 << 14), - PATH_RAY_SHADOW_CATCHER = (1 << 15), - PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), - PATH_RAY_STORE_SHADOW_INFO = (1 << 17), + PATH_RAY_MIS_SKIP = (1 << 15), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 16), + PATH_RAY_SINGLE_PASS_DONE = (1 << 17), + PATH_RAY_SHADOW_CATCHER = (1 << 18), + PATH_RAY_STORE_SHADOW_INFO = (1 << 19), }; /* Closure Label */ @@ -462,18 +454,42 @@ typedef enum DenoiseFlag { DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, } DenoiseFlag; +#ifdef __KERNEL_DEBUG__ +/* NOTE: This is a runtime-only struct, alignment is not + * really important here. 
+ */ +typedef struct DebugData { + int num_bvh_traversed_nodes; + int num_bvh_traversed_instances; + int num_bvh_intersections; + int num_ray_bounces; +} DebugData; +#endif + +typedef ccl_addr_space struct PathRadianceState { +#ifdef __PASSES__ + float3 diffuse; + float3 glossy; + float3 transmission; + float3 subsurface; + float3 scatter; + + float3 direct; +#endif +} PathRadianceState; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; #endif + float transparent; float3 emission; #ifdef __PASSES__ float3 background; float3 ao; float3 indirect; - float3 direct_throughput; float3 direct_emission; float3 color_diffuse; @@ -494,16 +510,12 @@ typedef ccl_addr_space struct PathRadiance { float3 indirect_subsurface; float3 indirect_scatter; - float3 path_diffuse; - float3 path_glossy; - float3 path_transmission; - float3 path_subsurface; - float3 path_scatter; - float4 shadow; float mist; #endif + struct PathRadianceState state; + #ifdef __SHADOW_TRICKS__ /* Total light reachable across the path, ignoring shadow blocked queries. */ float3 path_total; @@ -515,7 +527,18 @@ typedef ccl_addr_space struct PathRadiance { float3 path_total_shaded; /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_color; + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. + */ + float shadow_throughput; + + /* Accumulated transparency along the path after shadow catcher bounce. */ + float shadow_transparency; + + /* Indicate if any shadow catcher data is set. 
*/ + int has_shadow_catcher; #endif #ifdef __DENOISING_FEATURES__ @@ -523,6 +546,10 @@ typedef ccl_addr_space struct PathRadiance { float3 denoising_albedo; float denoising_depth; #endif /* __DENOISING_FEATURES__ */ + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; +#endif /* __KERNEL_DEBUG__ */ } PathRadiance; typedef struct BsdfEval { @@ -774,20 +801,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderClosure { float data[10]; /* pad to 80 bytes */ } ShaderClosure; -/* Shader Context - * - * For OSL we recycle a fixed number of contexts for speed */ - -typedef enum ShaderContext { - SHADER_CONTEXT_MAIN = 0, - SHADER_CONTEXT_INDIRECT = 1, - SHADER_CONTEXT_EMISSION = 2, - SHADER_CONTEXT_SHADOW = 3, - SHADER_CONTEXT_SSS = 4, - SHADER_CONTEXT_VOLUME = 5, - SHADER_CONTEXT_NUM = 6 -} ShaderContext; - /* Shader Data * * Main shader state at a point on the surface or in a volume. All coordinates @@ -850,7 +863,7 @@ enum ShaderDataFlag { SD_VOLUME_MIS = (1 << 23), /* Use cubic interpolation for voxels. */ SD_VOLUME_CUBIC = (1 << 24), - /* Has data connected to the displacement input. */ + /* Has data connected to the displacement input or uses bump map. */ SD_HAS_BUMP = (1 << 25), /* Has true displacement. 
*/ SD_HAS_DISPLACEMENT = (1 << 26), @@ -991,9 +1004,11 @@ typedef struct PathState { int flag; /* random number generator state */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ + uint rng_hash; /* per pixel hash */ + int rng_offset; /* dimension offset */ + int sample; /* path sample number */ + int num_samples; /* total number of times this path will be sampled */ + float branch_factor; /* number of branches in indirect paths */ /* bounce counting */ int bounce; @@ -1016,20 +1031,15 @@ typedef struct PathState { /* volume rendering */ #ifdef __VOLUME__ int volume_bounce; - RNG rng_congruential; + uint rng_congruential; VolumeStack volume_stack[VOLUME_STACK_SIZE]; #endif - -#ifdef __SHADOW_TRICKS__ - int catcher_object; -#endif } PathState; /* Subsurface */ /* Struct to gather multiple SSS hits. */ -typedef struct SubsurfaceIntersection -{ +typedef struct SubsurfaceIntersection { Ray ray; float3 weight[BSSRDF_MAX_HITS]; @@ -1039,17 +1049,14 @@ typedef struct SubsurfaceIntersection } SubsurfaceIntersection; /* Struct to gather SSS indirect rays and delay tracing them. 
*/ -typedef struct SubsurfaceIndirectRays -{ - bool need_update_volume_stack; - bool tracing; +typedef struct SubsurfaceIndirectRays { PathState state[BSSRDF_MAX_HITS]; - struct PathRadiance direct_L; int num_rays; + struct Ray rays[BSSRDF_MAX_HITS]; float3 throughputs[BSSRDF_MAX_HITS]; - struct PathRadiance L[BSSRDF_MAX_HITS]; + struct PathRadianceState L_state[BSSRDF_MAX_HITS]; } SubsurfaceIndirectRays; /* Constant Kernel Data @@ -1228,7 +1235,6 @@ typedef struct KernelIntegrator { int portal_offset; /* bounces */ - int min_bounce; int max_bounce; int max_diffuse_bounce; @@ -1239,7 +1245,6 @@ typedef struct KernelIntegrator { int ao_bounces; /* transparent */ - int transparent_min_bounce; int transparent_max_bounce; int transparent_shadows; @@ -1282,7 +1287,7 @@ typedef struct KernelIntegrator { float light_inv_rr_threshold; int start_sample; - int pad1, pad2, pad3; + int pad1; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1336,18 +1341,6 @@ typedef struct KernelData { } KernelData; static_assert_align(KernelData, 16); -#ifdef __KERNEL_DEBUG__ -/* NOTE: This is a runtime-only struct, alignment is not - * really important here. 
- */ -typedef ccl_addr_space struct DebugData { - int num_bvh_traversed_nodes; - int num_bvh_traversed_instances; - int num_bvh_intersections; - int num_ray_bounces; -} DebugData; -#endif - /* Declarations required for split kernel */ /* Macro for queues */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 1e472aaf51a..d9c310a893e 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -43,7 +43,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, float3 *extinction) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); + shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW); if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER))) return false; @@ -69,7 +69,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, VolumeShaderCoefficients *coeff) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME); + shader_eval_volume(kg, sd, state, state->volume_stack, state->flag); if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION))) return false; @@ -360,7 +360,6 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( ShaderData *sd, PathRadiance *L, ccl_addr_space float3 *throughput, - RNG *rng, bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -380,13 +379,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; /* decide if we will hit or miss */ bool scatter = true; - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float xi = 
path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); if(probalistic_scatter) { float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); @@ -439,7 +437,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( float3 sigma_t = coeff.sigma_a + coeff.sigma_s; float3 transmittance = volume_color_transmittance(sigma_t, ray->t); float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t); - path_radiance_accum_emission(L, *throughput, emission, state->bounce); + path_radiance_accum_emission(L, state, *throughput, emission); } /* modify throughput */ @@ -468,8 +466,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( Ray *ray, ShaderData *sd, PathRadiance *L, - ccl_addr_space float3 *throughput, - RNG *rng) + ccl_addr_space float3 *throughput) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ @@ -485,10 +482,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; bool has_scatter = false; for(int i = 0; i < max_steps; i++) { @@ -560,7 +556,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* integrate emission attenuated by absorption */ if(L && (closure_flag & SD_EMISSION)) { float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt); - path_radiance_accum_emission(L, tp, emission, state->bounce); + path_radiance_accum_emission(L, state, tp, emission); } 
/* modify throughput */ @@ -610,15 +606,14 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate( Ray *ray, PathRadiance *L, ccl_addr_space float3 *throughput, - RNG *rng, bool heterogeneous) { shader_setup_from_volume(kg, sd, ray); if(heterogeneous) - return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng); + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput); else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true); + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true); } #ifndef __SPLIT_KERNEL__ @@ -846,7 +841,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; float xi = rscatter; /* probabilistic scattering decision based on transmittance */ @@ -1000,8 +994,8 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); } } - if(sample_t < 1e-6f || pdf == 0.0f) { - return VOLUME_PATH_SCATTERED; + if(sample_t < 0.0f || pdf == 0.0f) { + return VOLUME_PATH_MISSED; } /* compute transmittance up to this step */ diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 28fc5ce1c30..0c11158e8da 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg) -{ - return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples; -} - -ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg) -{ - return ccl_global_size(0) * ccl_global_size(1) / 
WORK_POOL_SIZE; -} - -ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index) -{ - return ray_index / WORK_POOL_SIZE; -} - -ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool) -{ - uint total_work_size = kernel_total_work_size(kg); - uint num_pools = kernel_num_work_pools(kg); - - if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) { - return 0; - } - - uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE; - - uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE)); - if(work_pool < remainder / WORK_POOL_SIZE) { - work_size += WORK_POOL_SIZE; - } - else if(work_pool == remainder / WORK_POOL_SIZE) { - work_size += remainder % WORK_POOL_SIZE; - } - - return work_size; -} - -ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index) -{ - uint num_pools = kernel_num_work_pools(kg); - uint pool = work_pool_from_ray_index(kg, ray_index); - - return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE) - + (pool * WORK_POOL_SIZE) - + (work_index % WORK_POOL_SIZE); -} - /* Returns true if there is work */ -ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index) +ccl_device bool get_next_work(KernelGlobals *kg, + uint thread_index, + ccl_private uint *global_work_index) { - uint work_pool = work_pool_from_ray_index(kg, ray_index); - uint pool_size = work_pool_work_size(kg, work_pool); + uint total_work_size = kernel_split_params.w + * kernel_split_params.h + * kernel_split_params.num_samples; - if(pool_size == 0) { + /* With a small amount of work there may be more threads than work due to + * rounding up of global size, stop such threads immediately. 
*/ + if(thread_index >= total_work_size) { return false; } - *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]); - return (*work_index < pool_size); -} + /* Increase atomic work index counter in pool. */ + uint pool = thread_index / WORK_POOL_SIZE; + uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]); -/* This function assumes that the passed `work` is valid. */ -/* Decode sample number w.r.t. assigned `work`. */ -ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index) -{ - return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h); -} + /* Map per-pool work index to a global work index. */ + uint global_size = ccl_global_size(0) * ccl_global_size(1); + kernel_assert(global_size % WORK_POOL_SIZE == 0); + kernel_assert(thread_index < global_size); -/* Decode pixel and tile position w.r.t. assigned `work`. */ -ccl_device void get_work_pixel_tile_position(KernelGlobals *kg, - ccl_private uint *pixel_x, - ccl_private uint *pixel_y, - ccl_private uint *tile_x, - ccl_private uint *tile_y, - uint work_index, - uint ray_index) -{ - uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h); + *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); - *tile_x = pixel_index % kernel_split_params.w; - *tile_y = pixel_index / kernel_split_params.w; + /* Test if all work for this pool is done. */ + return (*global_work_index < total_work_size); +} - *pixel_x = *tile_x + kernel_split_params.x; - *pixel_y = *tile_y + kernel_split_params.y; +/* Map global work index to pixel X/Y and sample. 
*/ +ccl_device_inline void get_work_pixel(KernelGlobals *kg, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ + uint tile_pixels = kernel_split_params.w * kernel_split_params.h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; + uint y_offset = pixel_offset / kernel_split_params.w; + uint x_offset = pixel_offset - y_offset * kernel_split_params.w; + + *x = kernel_split_params.x + x_offset; + *y = kernel_split_params.y + y_offset; + *sample = kernel_split_params.start_sample + sample_offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp index 1a7b2040da1..254025be4e2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -25,6 +25,7 @@ #else /* SSE optimization disabled for now on 32 bit, see bug #36316 */ # if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ # define __KERNEL_SSE2__ # define __KERNEL_SSE3__ # define __KERNEL_SSSE3__ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h index 9fa39dc9ebb..7ae205b7e14 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -81,8 +81,13 @@ # error "Unknown or unsupported CUDA architecture, can't determine launch bounds" #endif -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. 
*/ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ __launch_bounds__( \ diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 628891b1458..e97e87285a5 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ kernel_##name(NULL); \ @@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init( #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ kernel_cuda_##name() \ { \ ccl_local type locals; \ diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index ba53ba4b26f..f015ac47d8a 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -235,7 +235,7 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, } __kernel void kernel_ocl_filter_finalize(int w, - int h, + int h, ccl_global float *buffer, ccl_global int *rank, ccl_global float *XtWX, diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 078acc1631e..b7108f3d0f8 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -52,9 +52,7 @@ __kernel 
void kernel_ocl_path_trace( ccl_global float *buffer, ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader( ccl_global float4 *output, ccl_global float *output_luma, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int sx, int sw, int offset, int sample) { @@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); @@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake( ccl_global uint4 *input, ccl_global float4 *output, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int filter, int sx, int sw, int offset, int sample) { @@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); @@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int 
sx, int sy, int sw, int sh, int offset, int stride) @@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); @@ -193,7 +178,7 @@ __kernel void kernel_ocl_convert_to_half_float( kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } -__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) { size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 8b85d362f8a..95b35e40a45 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init( int num_elements, ccl_global char *ray_state, ccl_global uint *rng_state, - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" - + 
KERNEL_BUFFER_PARAMS, int start_sample, int end_sample, int sx, int sy, int sw, int sh, int offset, int stride, @@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init( num_elements, ray_state, rng_state, - -#define KERNEL_TEX(type, ttype, name) name, -#include "kernel/kernel_textures.h" - + KERNEL_BUFFER_ARGS, start_sample, end_sample, sx, sy, sw, sh, offset, stride, diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 651addb02f4..4cbda1bc2e7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "kernel/kernel_compat_opencl.h" // PRECOMPILED +#include "kernel/split/kernel_split_common.h" // PRECOMPILED + #include "kernel/kernels/opencl/kernel_state_buffer_size.cl" #include "kernel/kernels/opencl/kernel_data_init.cl" #include "kernel/kernels/opencl/kernel_path_init.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h index f1e914a70d4..591c3846ef2 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -25,9 +25,7 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( ccl_global char *ray_state, ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, ccl_global int *queue_index, ccl_global char *use_queues_flag, @@ -52,12 +50,9 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" } - ccl_barrier(CCL_LOCAL_MEM_FENCE); + kernel_set_buffer_pointers(kg, 
KERNEL_BUFFER_ARGS); KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( kg diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 02c083a83f8..9585d9f4825 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -86,7 +86,7 @@ struct OSLThreadData { OSL::ShaderGlobals globals; OSL::PerThreadInfo *osl_thread_info; OSLTraceData tracedata; - OSL::ShadingContext *context[SHADER_CONTEXT_NUM]; + OSL::ShadingContext *context; OIIO::TextureSystem::Perthread *oiio_thread_info; }; diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 1535496c73d..8ad2e12b067 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -1197,8 +1197,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->init = true; tracedata->sd.osl_globals = sd->osl_globals; - /* raytrace */ - return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); + /* Raytrace, leaving out shadow opaque to avoid early exit. 
*/ + uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE; + return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 13b19d86eca..9a37e0987aa 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -57,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS tdata->globals.tracedata = &tdata->tracedata; tdata->globals.flipHandedness = false; tdata->osl_thread_info = ss->create_thread_info(); - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - tdata->context[i] = ss->get_context(tdata->osl_thread_info); + tdata->context = ss->get_context(tdata->osl_thread_info); tdata->oiio_thread_info = osl_globals->ts->get_perthread_info(); @@ -74,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg) OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSLThreadData *tdata = kg->osl_tdata; - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - ss->release_context(tdata->context[i]); + ss->release_context(tdata->context); ss->destroy_thread_info(tdata->osl_thread_info); @@ -173,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, } } -void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -182,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; /* automatic bump shader */ 
@@ -274,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd, } } -void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -283,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; if(kg->osl->background_state) { ss->execute(octx, *(kg->osl->background_state), *globals); @@ -329,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } -void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -338,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->volume_state[shader]) { @@ -352,7 +348,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx) +void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -364,7 +360,7 @@ void 
OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->displacement_state[shader]) { diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index 32121e940b4..f7020d1223d 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -53,10 +53,10 @@ public: static void thread_free(KernelGlobals *kg); /* eval */ - static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx); + static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_displacement(KernelGlobals *kg, ShaderData *sd); /* attributes */ static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl index 2bb981c3918..6870d479af3 100644 --- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -76,8 +76,8 @@ shader node_principled_bsdf( float aspect = sqrt(1.0 - Anisotropic * 0.9); float r2 = Roughness * 
Roughness; - float alpha_x = max(0.001, r2 / aspect); - float alpha_y = max(0.001, r2 * aspect); + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h index e2762a85fc8..2313feac089 100644 --- a/intern/cycles/kernel/split/kernel_branched.h +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -87,7 +87,6 @@ ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; path_radiance_init(inactive_L, kernel_data.film.use_light_pass); - inactive_L->direct_throughput = L->direct_throughput; path_radiance_copy_indirect(inactive_L, L); ray_state[inactive_ray] = RAY_REGENERATED; @@ -110,7 +109,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; ShaderData *sd = saved_sd; - RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; float3 throughput = branched_state->throughput; ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; @@ -157,37 +155,38 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( num_samples = ceil_to_int(num_samples_adjust*num_samples); float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(rng, i); for(int j = branched_state->next_sample; j < num_samples; j++) { if(reset_path_state) { *ps = branched_state->path_state; } + ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; *tp = throughput; ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; if(!kernel_branched_path_surface_bounce(kg, - &bsdf_rng, sd, sc, j, num_samples, tp, ps, - 
L, + &L->state, bsdf_ray, sum_sample_weight)) { continue; } + ps->rng_hash = branched_state->path_state.rng_hash; + /* update state for next iteration */ branched_state->next_closure = i; branched_state->next_sample = j+1; - branched_state->num_samples = num_samples; /* start the indirect path */ *tp *= num_samples_inv; diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 4c1fdd2d69c..c9e7deddafa 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -75,92 +75,59 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, if(ray_index != QUEUE_EMPTY_SLOT) { #endif - ccl_global uint *rng_state = kernel_split_params.rng_state; int stride = kernel_split_params.stride; ccl_global char *ray_state = kernel_split_state.ray_state; -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; -#endif ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - ccl_global float *buffer = kernel_split_params.buffer; - - unsigned int work_index; - ccl_global uint *initial_rng; - - unsigned int sample; - unsigned int tile_x; - unsigned int tile_y; - unsigned int pixel_x; - unsigned int pixel_y; - - work_index = kernel_split_state.work_array[ray_index]; - sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - initial_rng = rng_state; - - rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; - buffer += (kernel_split_params.offset + 
pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, state, debug_data, sample); -#endif + uint sample = state->sample; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; /* accumulate result in output buffer */ - bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); - kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); - - path_rng_end(kg, rng_state, rng); + kernel_write_result(kg, buffer, sample, L); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { /* We have completed current work; So get next work */ - int valid_work = get_next_work(kg, &work_index, ray_index); - if(!valid_work) { + uint work_index; + if(!get_next_work(kg, ray_index, &work_index)) { /* If work is invalid, this means no more work is available and the thread may exit */ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - kernel_split_state.work_array[ray_index] = work_index; - /* Get the sample associated with the current work */ - sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - /* Get pixel and tile position associated with current work */ - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); + uint x, y, sample; + get_work_pixel(kg, work_index, &x, &y, &sample); - /* Remap rng_state according to the current work */ - rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride; - /* Remap buffer according to the current work */ - buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + /* Remap rng_state to current pixel. 
*/ + ccl_global uint *rng_state = kernel_split_params.rng_state; + rng_state += kernel_split_params.offset + x + y*stride; + + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray); + uint rng_hash; + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray); if(ray->t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; + /* Initialize throughput, path radiance, Ray, PathState; * These rays proceed with path-iteration. */ *throughput = make_float3(1.0f, 1.0f, 1.0f); - *L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray); + path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray); #ifdef __SUBSURFACE__ kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); #endif -#ifdef __KERNEL_DEBUG__ - debug_data_init(debug_data); -#endif ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); enqueue_flag = 1; } @@ -168,14 +135,13 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. 
*/ + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; kernel_write_pass_float4(buffer, sample, L_rad); - path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } } } - kernel_split_state.rng[ray_index] = rng; #ifndef __COMPUTE_DEVICE_GPU__ } diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index e4545d66eff..2c042dfde6f 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -52,9 +52,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( ccl_global uint *rng_state, #ifdef __KERNEL_OPENCL__ -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "kernel/kernel_textures.h" + KERNEL_BUFFER_PARAMS, #endif int start_sample, @@ -100,9 +98,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); #ifdef __KERNEL_OPENCL__ -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "kernel/kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); #endif int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); @@ -127,14 +124,25 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( /* zero the tiles pixels and initialize rng_state if this is the first sample */ if(start_sample == 0) { - parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) { - int pixel = i / kernel_data.film.pass_stride; - int pass = i % kernel_data.film.pass_stride; + int pass_stride = kernel_data.film.pass_stride; + +#ifdef __KERNEL_CPU__ + for(int y = sy; y < sy + sh; y++) { + int index = offset + y * stride; + memset(buffer + (sx + index) * pass_stride, 0, sizeof(float) * pass_stride * sw); + for(int x = sx; x < sx + sw; x++) { + rng_state[index + x] = hash_int_2d(x, y); + } + } +#else + parallel_for(kg, i, sw * sh * pass_stride) { + int pixel = i / pass_stride; + int 
pass = i % pass_stride; int x = sx + pixel % sw; int y = sy + pixel / sw; - int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass; + int index = (offset + x + y*stride) * pass_stride + pass; *(buffer + index) = 0.0f; } @@ -146,6 +154,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( int index = (offset + x + y*stride); *(rng_state + index) = hash_int_2d(x, y); } +#endif } #endif /* KERENL_STUB */ diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 3336c968a44..2aac66ecb84 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -62,8 +62,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, /* direct lighting */ #ifdef __EMISSION__ - RNG rng = kernel_split_state.rng[ray_index]; - bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); @@ -83,23 +81,20 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, if(flag) { /* Sample illumination from lights to find path contribution. 
*/ - float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, &rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; if(light_sample(kg, - light_t, light_u, light_v, + light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { Ray light_ray; -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif BsdfEval L_light; bool is_lamp; @@ -115,7 +110,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, } } } - kernel_split_state.rng[ray_index] = rng; #endif /* __EMISSION__ */ } diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h index 9f8dd2392d9..491487f1230 100644 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -30,7 +30,6 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -58,22 +57,21 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous); + kg, ps, sd, &volume_ray, L, tp, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L); + kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, 
&branched_state->path_state, L); /* indirect light bounce */ - if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) { + if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) { continue; } /* start the indirect path */ branched_state->next_closure = 0; branched_state->next_sample = j+1; - branched_state->num_samples = num_samples; /* Attempting to share too many samples is slow for volumes as it causes us to * loop here more and have many calls to kernel_volume_integrate which evaluates @@ -141,7 +139,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -165,15 +162,15 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + kg, state, sd, &volume_ray, L, throughput, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L); + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) { + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { @@ -194,8 +191,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) } # endif /* __BRANCHED_PATH__ */ } - - kernel_split_state.rng[ray_index] = rng; } # ifdef 
__BRANCHED_PATH__ diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 670a557f084..dffd291012d 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -90,163 +90,58 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(ray_index != QUEUE_EMPTY_SLOT) { #endif - int stride = kernel_split_params.stride; - - unsigned int work_index; - unsigned int pixel_x; - unsigned int pixel_y; - - unsigned int tile_x; - unsigned int tile_y; - unsigned int sample; - - RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = 0x0; float3 throughput; ccl_global char *ray_state = kernel_split_state.ray_state; ShaderData *sd = &kernel_split_state.sd[ray_index]; - ccl_global float *buffer = kernel_split_params.buffer; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; + + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; throughput = kernel_split_state.throughput[ray_index]; state = &kernel_split_state.path_state[ray_index]; - work_index = kernel_split_state.work_array[ray_index]; - sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - - buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; - -#ifdef __SHADOW_TRICKS__ - if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state->flag & PATH_RAY_CAMERA) { - state->flag |= 
(PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); - state->catcher_object = sd->object; - if(!kernel_data.background.transparent) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); - } - } - } - else { - state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; - } -#endif /* __SHADOW_TRICKS__ */ - - /* holdout */ -#ifdef __HOLDOUT__ - if(((sd->flag & SD_HOLDOUT) || - (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && - (state->flag & PATH_RAY_CAMERA)) + if(!kernel_path_shader_apply(kg, + sd, + state, + ray, + throughput, + emission_sd, + L, + buffer)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - } - else { - holdout_weight = shader_holdout_eval(kg, sd); - } - /* any throughput is ok, should all be identical here */ - kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); - } - if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { - kernel_split_path_end(kg, ray_index); - } + kernel_split_path_end(kg, ray_index); } -#endif /* __HOLDOUT__ */ } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - -#ifdef __BRANCHED_PATH__ - if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) -#endif /* __BRANCHED_PATH__ */ - { - /* Holdout mask objects do not write data passes. */ - kernel_write_data_passes(kg, - buffer, - L, - sd, - sample, - state, - throughput); - } - - /* Blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy. 
- */ -#ifndef __BRANCHED_PATH__ - if(kernel_data.integrator.filter_glossy != FLT_MAX) -#else - if(kernel_data.integrator.filter_glossy != FLT_MAX && - (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))) -#endif /* __BRANCHED_PATH__ */ - { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if(sd->flag & SD_EMISSION) { - /* TODO(sergey): is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission( - kg, - sd, - kernel_split_state.isect[ray_index].t, - state->flag, - state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); - } -#endif /* __EMISSION__ */ - /* Path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. 
*/ -#ifndef __BRANCHED_PATH__ - float probability = path_state_terminate_probability(kg, state, throughput); -#else - float probability = 1.0f; - - if(!kernel_data.integrator.branched) { - probability = path_state_terminate_probability(kg, state, throughput); - } - else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - int num_samples = kernel_split_state.branched_state[ray_index].num_samples; - probability = path_state_terminate_probability(kg, state, throughput*num_samples); - } - else if(state->flag & PATH_RAY_TRANSPARENT) { - probability = path_state_terminate_probability(kg, state, throughput); - } -#endif + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { kernel_split_path_end(kg, ray_index); } - - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE); - if(terminate >= probability) { - kernel_split_path_end(kg, ray_index); - } - else { - kernel_split_state.throughput[ray_index] = throughput/probability; - } + else if(probability < 1.0f) { + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); + if(terminate >= probability) { + kernel_split_path_end(kg, ray_index); } + else { + kernel_split_state.throughput[ray_index] = throughput/probability; + } + } + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; kernel_update_denoising_features(kg, sd, state, L); } } @@ -260,8 +155,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( } #endif /* __AO__ */ - kernel_split_state.rng[ray_index] = rng; - #ifndef __COMPUTE_DEVICE_GPU__ } #endif diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h index f0ebb90f60a..437043a5971 100644 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ 
b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -33,7 +33,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) if(ray_index != QUEUE_EMPTY_SLOT) { if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - if(state->bounce > kernel_data.integrator.ao_bounces) { + if(path_state_ao_bounce(kg, state)) { kernel_split_path_end(kg, ray_index); } } @@ -50,33 +50,16 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) return; } - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { - *L_transparent = (*L_transparent) + average((*throughput)); -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif - kernel_split_path_end(kg, ray_index); - } + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); - path_radiance_accum_background(L, state, (*throughput), L_background); -#endif - kernel_split_path_end(kg, ray_index); - } + kernel_path_background(kg, state, ray, throughput, 
emission_sd, L); + kernel_split_path_end(kg, ray_index); } - - } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h index 82bc2f01fd7..e9fe5552e8c 100644 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -54,7 +54,6 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) #endif if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - kernel_path_subsurface_accum_indirect(ss_indirect, L); /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index c669d79ddcd..448456d167d 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -57,27 +57,10 @@ ccl_device void kernel_lamp_emission(KernelGlobals *kg) float3 throughput = kernel_split_state.throughput[ray_index]; Ray ray = kernel_split_state.ray[ray_index]; + ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray.P - state->ray_t*ray.D; - state->ray_t += kernel_split_state.isect[ray_index].t; - light_ray.D = ray.D; - light_ray.t = state->ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; - /* intersect with lamp */ - float3 emission; - - if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) { - path_radiance_accum_emission(L, throughput, 
emission, state->bounce); - } - } -#endif /* __LAMP_MIS__ */ + kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L); } } diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 7758e35fd32..c3373174582 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -126,7 +126,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, if(active) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; @@ -135,7 +134,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { #endif /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) { kernel_split_path_end(kg, ray_index); } #ifdef __BRANCHED_PATH__ @@ -157,8 +156,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, } } #endif /* __BRANCHED_PATH__ */ - - kernel_split_state.rng[ray_index] = rng; } /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h index a7ecde7c80d..0ab2289348b 100644 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -29,77 +29,59 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { */ kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - - unsigned int work_index = 0; /* Get work. */ - if(!get_next_work(kg, &work_index, ray_index)) { + uint work_index; + if(!get_next_work(kg, ray_index, &work_index)) { /* No more work, mark ray as inactive */ kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; return; } - /* Get the sample associated with the work. */ - my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - - /* Get pixel and tile position associated with the work. */ - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - kernel_split_state.work_array[ray_index] = work_index; + uint x, y, sample; + get_work_pixel(kg, work_index, &x, &y, &sample); + /* Remap rng_state and buffer to current pixel. */ ccl_global uint *rng_state = kernel_split_params.rng_state; - rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; - - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; + rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride; - RNG rng = kernel_split_state.rng[ray_index]; + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; /* Initialize random numbers and ray. 
*/ + uint rng_hash; kernel_path_trace_setup(kg, rng_state, - my_sample, - pixel_x, pixel_y, - &rng, + sample, + x, y, + &rng_hash, &kernel_split_state.ray[ray_index]); if(kernel_split_state.ray[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; + /* Initialize throughput, path radiance, Ray, PathState; * These rays proceed with path-iteration. */ kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - kernel_split_state.L_transparent[ray_index] = 0.0f; path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], &kernel_split_state.path_state[ray_index], - &rng, - my_sample, + rng_hash, + sample, &kernel_split_state.ray[ray_index]); #ifdef __SUBSURFACE__ kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); #endif - -#ifdef __KERNEL_DEBUG__ - debug_data_init(&kernel_split_state.debug_data[ray_index]); -#endif } else { /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. 
*/ - kernel_write_pass_float4(buffer, my_sample, L_rad); - path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]); + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; + kernel_write_pass_float4(buffer, sample, L_rad); ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); } - kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 45984ca509b..f5378bc172b 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -59,52 +59,14 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) return; } -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; -#endif - Intersection isect; - PathState state = kernel_split_state.path_state[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; Ray ray = kernel_split_state.ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - /* intersect scene */ - uint visibility = path_state_ray_visibility(kg, &state); - - if(state.bounce > kernel_data.integrator.ao_bounces) { - visibility = PATH_RAY_SHADOW; - ray.t = kernel_data.background.ao_distance; - } - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - RNG rng = kernel_split_state.rng[ray_index]; - - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d); - } - - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = 
scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif + Intersection isect; + bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L); kernel_split_state.isect[ray_index] = isect; -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data->num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data->num_bvh_intersections += isect.num_intersections; - } - debug_data->num_ray_bounces++; -#endif - if(!hit) { /* Change the state of rays that hit the background; * These rays undergo special processing in the diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 2801b32f285..7032461b04a 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -48,30 +48,18 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg) ccl_global char *ray_state = kernel_split_state.ray_state; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; -#ifndef __BRANCHED_PATH__ - float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); - shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); -#else - ShaderContext ctx = SHADER_CONTEXT_MAIN; - float rbsdf = 0.0f; - - if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); - + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag); +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) { + shader_merge_closures(&kernel_split_state.sd[ray_index]); } - - if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - ctx = SHADER_CONTEXT_INDIRECT; + else +#endif + { + 
shader_prepare_closures(&kernel_split_state.sd[ray_index], state); } - - shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx); - shader_merge_closures(&kernel_split_state.sd[ray_index]); -#endif /* __BRANCHED_PATH__ */ - - kernel_split_state.rng[ray_index] = rng; } } diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h index 297decb0bc2..5a55b680695 100644 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local ushort *local_index = &locals->local_index[0]; /* copy to local memory */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint add = input + idx; uint value = (~0); @@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # ifdef __KERNEL_OPENCL__ /* bitonic sort */ - for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { - for (uint inc = length; inc > 0; inc >>= 1) { - for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for(uint inc = length; inc > 0; inc >>= 1) { + for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { uint i = lid + ii; bool direction = ((i & (length << 1)) != 0); uint j = i ^ inc; @@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, # endif /* __KERNEL_OPENCL__ */ /* copy to destination */ - for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { uint idx = offset + i + lid; uint lidx = local_index[i + lid]; uint outi = output + idx; diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h 
b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h index 474286285a9..79aa2c9435b 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -37,21 +37,18 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; float3 throughput = kernel_split_state.throughput[ray_index]; #ifdef __BRANCHED_PATH__ if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { #endif - kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd)); + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd)); #ifdef __BRANCHED_PATH__ } else { - kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput); + kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput); } #endif - - kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index 78e61709b01..b52f9a5eb81 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -45,7 +45,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; float3 throughput = kernel_split_state.throughput[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -75,7 +74,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) 
if(use_branched) { kernel_branched_path_surface_connect_light(kg, - &rng, sd, emission_sd, state, @@ -91,10 +89,11 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) float3 shadow; if(!shadow_blocked(kg, - emission_sd, - state, - &ray, - &shadow)) + sd, + emission_sd, + state, + &ray, + &shadow)) { /* accumulate */ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); @@ -103,8 +102,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) path_radiance_accum_total_light(L, state, throughput, &L_light); } } - - kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 08f0124b529..558d327bc76 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -63,7 +63,7 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; path_radiance_sum_indirect(L); - path_radiance_accum_sample(orig_ray_L, L, 1); + path_radiance_accum_sample(orig_ray_L, L); atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index 4bb2f0d3d80..c58c8463f5c 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -56,14 +56,6 @@ typedef struct SplitParams { /* SPLIT_DATA_ENTRY(type, name, num) */ -#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__) -/* DebugData memory */ -# define SPLIT_DATA_DEBUG_ENTRIES \ - SPLIT_DATA_ENTRY(DebugData, debug_data, 1) -#else -# define SPLIT_DATA_DEBUG_ENTRIES -#endif /* DEBUG */ - #ifdef __BRANCHED_PATH__ typedef ccl_global struct SplitBranchedState { @@ -80,7 +72,6 @@ typedef ccl_global 
struct SplitBranchedState { /* indirect loop state */ int next_closure; int next_sample; - int num_samples; #ifdef __SUBSURFACE__ int ss_next_closure; @@ -122,9 +113,7 @@ typedef ccl_global struct SplitBranchedState { #endif /* __VOLUME__ */ #define SPLIT_DATA_ENTRIES \ - SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ @@ -133,19 +122,16 @@ typedef ccl_global struct SplitBranchedState { SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ - SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ SPLIT_DATA_SUBSURFACE_ENTRIES \ SPLIT_DATA_VOLUME_ENTRIES \ SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_DEBUG_ENTRIES \ /* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) 
*/ #define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ - SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ - SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ @@ -158,7 +144,6 @@ typedef ccl_global struct SplitBranchedState { SPLIT_DATA_SUBSURFACE_ENTRIES \ SPLIT_DATA_VOLUME_ENTRIES \ SPLIT_DATA_BRANCHED_ENTRIES \ - SPLIT_DATA_DEBUG_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ typedef struct SplitData { diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index d5083b23f80..3b957856aea 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -38,7 +38,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; ShaderData *sd = &branched_state->sd; - RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -52,14 +51,12 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { - branched_state->lcg_state = lcg_state_init(&rng, - branched_state->path_state.rng_offset, - branched_state->path_state.sample, - 0x68bc21eb); + branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state, + 0x68bc21eb); } int num_samples = kernel_data.integrator.subsurface_samples; float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(rng, i); + uint bssrdf_rng_hash = 
cmj_hash(branched_state->path_state.rng_hash, i); /* do subsurface scatter step with copy of shader data, this will * replace the BSSRDF with a diffuse BSDF closure */ @@ -67,7 +64,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; float bssrdf_u, bssrdf_v; path_branched_rng_2D(kg, - &bssrdf_rng, + bssrdf_rng_hash, &branched_state->path_state, j, num_samples, @@ -77,7 +74,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it /* intersection is expensive so avoid doing multiple times for the same input */ if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { - RNG lcg_state = branched_state->lcg_state; + uint lcg_state = branched_state->lcg_state; SubsurfaceIntersection ss_isect_private; branched_state->num_hits = subsurface_scatter_multi_intersect(kg, @@ -152,7 +149,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it int all = (kernel_data.integrator.sample_all_lights_direct) || (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - &rng, bssrdf_sd, emission_sd, hit_state, @@ -229,7 +225,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; @@ -246,7 +241,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) emission_sd, L, state, - &rng, ray, throughput, ss_indirect)) @@ -256,21 +250,17 @@ ccl_device void 
kernel_subsurface_scatter(KernelGlobals *kg) #ifdef __BRANCHED_PATH__ } else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { - uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb); - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, - &rng, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); subsurface_scatter_step(kg, sd, state, @@ -290,7 +280,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) } #endif } - kernel_split_state.rng[ray_index] = rng; } # ifdef __BRANCHED_PATH__ diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 7704aa545c8..4268813b263 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -280,8 +280,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); float r2 = roughness * roughness; - bsdf->alpha_x = fmaxf(0.001f, r2 / aspect); - bsdf->alpha_y = fmaxf(0.001f, r2 * aspect); + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. 
to isolate hue+sat diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 8e45dbfa5ff..6d6e92e73f6 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,19 +16,6 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. */ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { #ifdef __KERNEL_CPU__ @@ -50,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, switch(id) { case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; + case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break; + case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break; + case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break; + case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break; + case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break; case 9: r = 
kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = 
kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = 
kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = 
kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; + case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; + case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; + case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; + case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; + case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; + case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; + case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; + case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; + case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break; + case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break; + case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break; + case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break; + case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break; + case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break; + case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break; + case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break; + case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break; + case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break; + case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break; + case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break; + case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break; + case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break; + case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break; + case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break; + case 281: r = 
kernel_tex_image_interp(__tex_image_byte4_281, x, y); break; + case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break; + case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break; + case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break; + case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break; + case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break; + case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break; + case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break; + case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break; + case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break; + case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break; + case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break; + case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break; + case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break; + case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break; + case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break; + case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break; + case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break; + case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break; + case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break; + case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break; + case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break; + case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break; + case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break; + case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break; + case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break; + case 489: r 
= kernel_tex_image_interp(__tex_image_byte4_489, x, y); break; + case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break; + case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break; + case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break; + case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break; + case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break; + case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break; + case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break; + case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break; + case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break; + case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break; + case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break; + case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break; + case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break; + case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break; + case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break; + case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break; + case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break; + case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break; + case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break; + case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break; + case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break; + case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -224,6 +211,8 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float object_inverse_normal_transform(kg, sd, &N); /* project from direction 
vector to barycentric coordinates in triangles */ + float3 signed_N = N; + N.x = fabsf(N.x); N.y = fabsf(N.y); N.z = fabsf(N.z); @@ -293,12 +282,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f); uint use_alpha = stack_valid(alpha_offset); - if(weight.x > 0.0f) - f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha); - if(weight.y > 0.0f) - f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha); - if(weight.z > 0.0f) - f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha); + /* Map so that no textures are flipped, rotation is somewhat arbitrary. */ + if(weight.x > 0.0f) { + float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z); + f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.y > 0.0f) { + float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z); + f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.z > 0.0f) { + float2 uv = make_float2((signed_N.z > 0.0f)? 
1.0f - co.y: co.y, co.x); + f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } if(stack_valid(out_offset)) stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z)); diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp index 2d810ff664f..08203163d1a 100644 --- a/intern/cycles/render/graph.cpp +++ b/intern/cycles/render/graph.cpp @@ -221,28 +221,6 @@ OutputNode *ShaderGraph::output() return (OutputNode*)nodes.front(); } -ShaderGraph *ShaderGraph::copy() -{ - ShaderGraph *newgraph = new ShaderGraph(); - - /* copy nodes */ - ShaderNodeSet nodes_all; - foreach(ShaderNode *node, nodes) - nodes_all.insert(node); - - ShaderNodeMap nodes_copy; - copy_nodes(nodes_all, nodes_copy); - - /* add nodes (in same order, so output is still first) */ - newgraph->clear_nodes(); - foreach(ShaderNode *node, nodes) - newgraph->add(nodes_copy[node]); - - newgraph->simplified = simplified; - - return newgraph; -} - void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to) { assert(!finalized); @@ -1040,6 +1018,9 @@ int ShaderGraph::get_num_closures() else if(CLOSURE_IS_PRINCIPLED(closure_type)) { num_closures += 8; } + else if(CLOSURE_IS_VOLUME(closure_type)) { + num_closures += VOLUME_STACK_SIZE; + } else { ++num_closures; } diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 72e391991a7..f0fd789c6bd 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -151,6 +151,7 @@ public: virtual bool has_surface_emission() { return false; } virtual bool has_surface_transparent() { return false; } virtual bool has_surface_bssrdf() { return false; } + virtual bool has_bump() { return false; } virtual bool has_bssrdf_bump() { return false; } virtual bool has_spatial_varying() { return false; } virtual bool has_object_dependency() { return false; } @@ -245,8 +246,6 @@ public: ShaderGraph(); ~ShaderGraph(); - ShaderGraph *copy(); - ShaderNode *add(ShaderNode *node); OutputNode *output(); 
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index f4482e0bb25..bb94b9bb82a 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -43,7 +43,6 @@ static bool isfinite(half /*value*/) ImageManager::ImageManager(const DeviceInfo& info) { need_update = true; - pack_images = false; osl_texture_system = NULL; animation_frame = 0; @@ -87,11 +86,6 @@ ImageManager::~ImageManager() } } -void ImageManager::set_pack_images(bool pack_images_) -{ - pack_images = pack_images_; -} - void ImageManager::set_osl_texture_system(void *texture_system) { osl_texture_system = texture_system; @@ -115,16 +109,18 @@ bool ImageManager::set_animation_frame_update(int frame) ImageDataType ImageManager::get_image_metadata(const string& filename, void *builtin_data, - bool& is_linear) + bool& is_linear, + bool& builtin_free_cache) { bool is_float = false, is_half = false; is_linear = false; + builtin_free_cache = false; int channels = 4; if(builtin_data) { if(builtin_image_info_cb) { int width, height, depth; - builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels); + builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels, builtin_free_cache); } if(is_float) { @@ -218,37 +214,14 @@ int ImageManager::max_flattened_slot(ImageDataType type) /* The lower three bits of a device texture slot number indicate its type. * These functions convert the slot ids from ImageManager "images" ones * to device ones and vice verse. - * - * There are special cases for CUDA Fermi, since there we have only 90 image texture - * slots available and should keep the flattended numbers in the 0-89 range. 
*/ int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type) { - if(cuda_fermi_limits) { - if(type == IMAGE_DATA_TYPE_BYTE4) { - return slot + TEX_START_BYTE4_CUDA; - } - else { - return slot; - } - } - return (slot << IMAGE_DATA_TYPE_SHIFT) | (type); } int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type) { - if(cuda_fermi_limits) { - if(flat_slot >= 4) { - *type = IMAGE_DATA_TYPE_BYTE4; - return flat_slot - TEX_START_BYTE4_CUDA; - } - else { - *type = IMAGE_DATA_TYPE_FLOAT4; - return flat_slot; - } - } - *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK); return flat_slot >> IMAGE_DATA_TYPE_SHIFT; } @@ -295,8 +268,9 @@ int ImageManager::add_image(const string& filename, { Image *img; size_t slot; + bool builtin_free_cache; - ImageDataType type = get_image_metadata(filename, builtin_data, is_linear); + ImageDataType type = get_image_metadata(filename, builtin_data, is_linear, builtin_free_cache); thread_scoped_lock device_lock(device_mutex); @@ -364,7 +338,7 @@ int ImageManager::add_image(const string& filename, else { /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. 
*/ int tex_count = 0; - for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { tex_count += tex_num_images[type]; } if(tex_count > max_num_images) { @@ -382,6 +356,7 @@ int ImageManager::add_image(const string& filename, img = new Image(); img->filename = filename; img->builtin_data = builtin_data; + img->builtin_free_cache = builtin_free_cache; img->need_load = true; img->animated = animated; img->frame = frame; @@ -467,7 +442,12 @@ void ImageManager::tag_reload_image(const string& filename, } } -bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components) +bool ImageManager::file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components) { if(img->filename == "") return false; @@ -506,8 +486,8 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid if(!builtin_image_info_cb || !builtin_image_pixels_cb) return false; - bool is_float; - builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components); + bool is_float, free_cache; + builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components, free_cache); } /* we only handle certain number of components */ @@ -542,6 +522,10 @@ bool ImageManager::file_load_image(Image *img, vector<StorageType> pixels_storage; StorageType *pixels; const size_t max_size = max(max(width, height), depth); + if(max_size == 0) { + /* Don't bother with invalid images. */ + return false; + } if(texture_limit > 0 && max_size > texture_limit) { pixels_storage.resize(((size_t)width)*height*depth*4); pixels = &pixels_storage[0]; @@ -549,6 +533,10 @@ bool ImageManager::file_load_image(Image *img, else { pixels = (StorageType*)tex_img.resize(width, height, depth); } + if(pixels == NULL) { + /* Could be that we've run out of memory. 
*/ + return false; + } bool cmyk = false; const size_t num_pixels = ((size_t)width) * height * depth; if(in) { @@ -588,13 +576,15 @@ bool ImageManager::file_load_image(Image *img, builtin_image_float_pixels_cb(img->filename, img->builtin_data, (float*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else if(FileFormat == TypeDesc::UINT8) { builtin_image_pixels_cb(img->filename, img->builtin_data, (uchar*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else { /* TODO(dingto): Support half for ImBuf. */ @@ -754,7 +744,7 @@ void ImageManager::device_load_image(Device *device, pixels[3] = TEX_IMAGE_MISSING_A; } - if(!pack_images) { + { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), tex_img, @@ -783,7 +773,7 @@ void ImageManager::device_load_image(Device *device, pixels[0] = TEX_IMAGE_MISSING_R; } - if(!pack_images) { + { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), tex_img, @@ -815,7 +805,7 @@ void ImageManager::device_load_image(Device *device, pixels[3] = (TEX_IMAGE_MISSING_A * 255); } - if(!pack_images) { + { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), tex_img, @@ -843,7 +833,7 @@ void ImageManager::device_load_image(Device *device, pixels[0] = (TEX_IMAGE_MISSING_R * 255); } - if(!pack_images) { + { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), tex_img, @@ -874,7 +864,7 @@ void ImageManager::device_load_image(Device *device, pixels[3] = TEX_IMAGE_MISSING_A; } - if(!pack_images) { + { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), tex_img, @@ -902,7 +892,7 @@ void ImageManager::device_load_image(Device *device, pixels[0] = TEX_IMAGE_MISSING_R; } - if(!pack_images) { + { thread_scoped_lock device_lock(device_mutex); device->tex_alloc(name.c_str(), tex_img, @@ -1059,9 +1049,6 @@ void 
ImageManager::device_update(Device *device, pool.wait_work(); - if(pack_images) - device_pack_images(device, dscene, progress); - need_update = false; } @@ -1091,141 +1078,6 @@ void ImageManager::device_update_slot(Device *device, } } -uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot) -{ - uint8_t options = 0; - /* Image Options are packed into one uint: - * bit 0 -> Interpolation - * bit 1 + 2 + 3 -> Extension - */ - if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) { - options |= (1 << 0); - } - if(images[type][slot]->extension == EXTENSION_REPEAT) { - options |= (1 << 1); - } - else if(images[type][slot]->extension == EXTENSION_EXTEND) { - options |= (1 << 2); - } - else /* EXTENSION_CLIP */ { - options |= (1 << 3); - } - return options; -} - -template<typename T> -void ImageManager::device_pack_images_type( - ImageDataType type, - const vector<device_vector<T>*>& cpu_textures, - device_vector<T> *device_image, - uint4 *info) -{ - size_t size = 0, offset = 0; - /* First step is to calculate size of the texture we need. */ - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(images[type][slot] == NULL) { - continue; - } - device_vector<T>& tex_img = *cpu_textures[slot]; - size += tex_img.size(); - } - /* Now we know how much memory we need, so we can allocate and fill. 
*/ - T *pixels = device_image->resize(size); - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(images[type][slot] == NULL) { - continue; - } - device_vector<T>& tex_img = *cpu_textures[slot]; - uint8_t options = pack_image_options(type, slot); - const int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, - tex_img.data_height, - offset, - options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - memcpy(pixels + offset, - (void*)tex_img.data_pointer, - tex_img.memory_size()); - offset += tex_img.size(); - } -} - -void ImageManager::device_pack_images(Device *device, - DeviceScene *dscene, - Progress& /*progess*/) -{ - /* For OpenCL, we pack all image textures into a single large texture, and - * do our own interpolation in the kernel. - */ - - /* TODO(sergey): This will over-allocate a bit, but this is constant memory - * so should be fine for a short term. - */ - const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4), - max_flattened_slot(IMAGE_DATA_TYPE_BYTE4), - max_flattened_slot(IMAGE_DATA_TYPE_FLOAT), - max_flattened_slot(IMAGE_DATA_TYPE_BYTE)); - uint4 *info = dscene->tex_image_packed_info.resize(info_size*2); - - /* Pack byte4 textures. */ - device_pack_images_type(IMAGE_DATA_TYPE_BYTE4, - dscene->tex_byte4_image, - &dscene->tex_image_byte4_packed, - info); - /* Pack float4 textures. */ - device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4, - dscene->tex_float4_image, - &dscene->tex_image_float4_packed, - info); - /* Pack byte textures. */ - device_pack_images_type(IMAGE_DATA_TYPE_BYTE, - dscene->tex_byte_image, - &dscene->tex_image_byte_packed, - info); - /* Pack float textures. */ - device_pack_images_type(IMAGE_DATA_TYPE_FLOAT, - dscene->tex_float_image, - &dscene->tex_image_float_packed, - info); - - /* Push textures to the device. 
*/ - if(dscene->tex_image_byte4_packed.size()) { - if(dscene->tex_image_byte4_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_byte4_packed); - } - device->tex_alloc("__tex_image_byte4_packed", dscene->tex_image_byte4_packed); - } - if(dscene->tex_image_float4_packed.size()) { - if(dscene->tex_image_float4_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_float4_packed); - } - device->tex_alloc("__tex_image_float4_packed", dscene->tex_image_float4_packed); - } - if(dscene->tex_image_byte_packed.size()) { - if(dscene->tex_image_byte_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_byte_packed); - } - device->tex_alloc("__tex_image_byte_packed", dscene->tex_image_byte_packed); - } - if(dscene->tex_image_float_packed.size()) { - if(dscene->tex_image_float_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_float_packed); - } - device->tex_alloc("__tex_image_float_packed", dscene->tex_image_float_packed); - } - if(dscene->tex_image_packed_info.size()) { - if(dscene->tex_image_packed_info.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_packed_info); - } - device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info); - } -} - void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene) { for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { @@ -1251,18 +1103,6 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene) dscene->tex_float_image.clear(); dscene->tex_byte_image.clear(); dscene->tex_half_image.clear(); - - device->tex_free(dscene->tex_image_float4_packed); - device->tex_free(dscene->tex_image_byte4_packed); - device->tex_free(dscene->tex_image_float_packed); - device->tex_free(dscene->tex_image_byte_packed); - 
device->tex_free(dscene->tex_image_packed_info); - - dscene->tex_image_float4_packed.clear(); - dscene->tex_image_byte4_packed.clear(); - dscene->tex_image_float_packed.clear(); - dscene->tex_image_byte_packed.clear(); - dscene->tex_image_packed_info.clear(); } CCL_NAMESPACE_END diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 77214bf25bc..c86d1cbedbf 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -57,7 +57,10 @@ public: InterpolationType interpolation, ExtensionType extension, bool use_alpha); - ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear); + ImageDataType get_image_metadata(const string& filename, + void *builtin_data, + bool& is_linear, + bool& builtin_free_cache); void device_prepare_update(DeviceScene *dscene); void device_update(Device *device, @@ -73,7 +76,6 @@ public: void device_free_builtin(Device *device, DeviceScene *dscene); void set_osl_texture_system(void *texture_system); - void set_pack_images(bool pack_images_); bool set_animation_frame_update(int frame); bool need_update; @@ -88,19 +90,23 @@ public: int &width, int &height, int &depth, - int &channels)> builtin_image_info_cb; + int &channels, + bool &free_cache)> builtin_image_info_cb; function<bool(const string &filename, void *data, unsigned char *pixels, - const size_t pixels_size)> builtin_image_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_pixels_cb; function<bool(const string &filename, void *data, float *pixels, - const size_t pixels_size)> builtin_image_float_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_float_pixels_cb; struct Image { string filename; void *builtin_data; + bool builtin_free_cache; bool use_alpha; bool need_load; @@ -123,9 +129,13 @@ private: vector<Image*> images[IMAGE_DATA_NUM_TYPES]; void *osl_texture_system; - bool pack_images; - bool file_load_image_generic(Image *img, ImageInput **in, int 
&width, int &height, int &depth, int &components); + bool file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components); template<TypeDesc::BASETYPE FileFormat, typename StorageType, @@ -140,8 +150,6 @@ private: int flattened_slot_to_type_index(int flat_slot, ImageDataType *type); string name_from_type(int type); - uint8_t pack_image_options(ImageDataType type, size_t slot); - void device_load_image(Device *device, DeviceScene *dscene, Scene *scene, @@ -152,17 +160,6 @@ private: DeviceScene *dscene, ImageDataType type, int slot); - - template<typename T> - void device_pack_images_type( - ImageDataType type, - const vector<device_vector<T>*>& cpu_textures, - device_vector<T> *device_image, - uint4 *info); - - void device_pack_images(Device *device, - DeviceScene *dscene, - Progress& progess); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index a004bb5b856..15b728d6e02 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -31,7 +31,6 @@ NODE_DEFINE(Integrator) { NodeType *type = NodeType::add("integrator", create); - SOCKET_INT(min_bounce, "Min Bounce", 2); SOCKET_INT(max_bounce, "Max Bounce", 7); SOCKET_INT(max_diffuse_bounce, "Max Diffuse Bounce", 7); @@ -39,9 +38,7 @@ NODE_DEFINE(Integrator) SOCKET_INT(max_transmission_bounce, "Max Transmission Bounce", 7); SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7); - SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 2); SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); - SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false); SOCKET_INT(ao_bounces, "AO Bounces", 0); @@ -104,7 +101,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene /* integrator parameters */ kintegrator->max_bounce = max_bounce + 1; - kintegrator->min_bounce = min_bounce + 1; kintegrator->max_diffuse_bounce = 
max_diffuse_bounce + 1; kintegrator->max_glossy_bounce = max_glossy_bounce + 1; @@ -112,7 +108,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->max_volume_bounce = max_volume_bounce + 1; kintegrator->transparent_max_bounce = transparent_max_bounce + 1; - kintegrator->transparent_min_bounce = transparent_min_bounce + 1; if(ao_bounces == 0) { kintegrator->ao_bounces = INT_MAX; @@ -125,19 +120,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene * We only need to enable transparent shadows, if we actually have * transparent shaders in the scene. Otherwise we can disable it * to improve performance a bit. */ - if(transparent_shadows) { - kintegrator->transparent_shadows = false; - foreach(Shader *shader, scene->shaders) { - /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ - if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { - kintegrator->transparent_shadows = true; - break; - } + kintegrator->transparent_shadows = false; + foreach(Shader *shader, scene->shaders) { + /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ + if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { + kintegrator->transparent_shadows = true; + break; } } - else { - kintegrator->transparent_shadows = false; - } kintegrator->volume_max_steps = volume_max_steps; kintegrator->volume_step_size = volume_step_size; diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 9501d7f8416..3cb430d72b4 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -31,7 +31,6 @@ class Integrator : public Node { public: NODE_DECLARE - int min_bounce; int max_bounce; int max_diffuse_bounce; @@ -39,9 +38,7 @@ public: int max_transmission_bounce; int max_volume_bounce; - int transparent_min_bounce; int transparent_max_bounce; - bool 
transparent_shadows; int ao_bounces; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 93d88c5642c..4adc00bc839 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -225,17 +225,13 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene) bool LightManager::object_usable_as_light(Object *object) { Mesh *mesh = object->mesh; /* Skip objects with NaNs */ - if (!object->bounds.valid()) { + if(!object->bounds.valid()) { return false; } /* Skip if we are not visible for BSDFs. */ if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) { return false; } - /* Skip motion blurred deforming meshes, not supported yet. */ - if(mesh->has_motion_blur()) { - return false; - } /* Skip if we have no emission shaders. */ /* TODO(sergey): Ideally we want to avoid such duplicated loop, since it'll * iterate all mesh shaders twice (when counting and when calculating diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 03825f780e0..84537bf5993 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -1925,16 +1925,7 @@ void MeshManager::device_update_displacement_images(Device *device, if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) { continue; } - if(device->info.pack_images) { - /* If device requires packed images we need to update all - * images now, even if they're not used for displacement. 
- */ - image_manager->device_update(device, - dscene, - scene, - progress); - return; - } + ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode*>(node); int slot = image_node->slot; if(slot != -1) { diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 86e25df1da3..2b682756c6a 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -365,7 +365,8 @@ void ImageTextureNode::compile(OSLCompiler& compiler) if(is_float == -1) { if(builtin_data == NULL) { ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); + bool builtin_free_cache; + type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache); if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = 1; } @@ -554,7 +555,8 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler) if(is_float == -1) { if(builtin_data == NULL) { ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); + bool builtin_free_cache; + type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache); if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = 1; } @@ -1799,6 +1801,14 @@ BsdfBaseNode::BsdfBaseNode(const NodeType *node_type) special_type = SHADER_SPECIAL_TYPE_CLOSURE; } +bool BsdfBaseNode::has_bump() +{ + /* detect if anything is plugged into the normal input besides the default */ + ShaderInput *normal_in = input("Normal"); + return (normal_in && normal_in->link && + normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY); +} + /* BSDF Closure */ BsdfNode::BsdfNode(const NodeType *node_type) @@ -2437,9 +2447,7 @@ void PrincipledBsdfNode::compile(OSLCompiler& compiler) bool PrincipledBsdfNode::has_bssrdf_bump() { - /* detect if anything is plugged into the normal input besides the default */ - ShaderInput *normal_in = input("Normal"); - return 
(normal_in->link && normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY); + return has_surface_bssrdf() && has_bump(); } /* Translucent BSDF Closure */ diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index c0271a3c8eb..ec4c7c7c50d 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -326,6 +326,16 @@ class BsdfBaseNode : public ShaderNode { public: BsdfBaseNode(const NodeType *node_type); + bool has_spatial_varying() { return true; } + virtual ClosureType get_closure_type() { return closure; } + virtual bool has_bump(); + + virtual bool equals(const ShaderNode& /*other*/) + { + /* TODO(sergey): With some care BSDF nodes can be de-duplicated. */ + return false; + } + ClosureType closure; }; @@ -334,19 +344,11 @@ public: explicit BsdfNode(const NodeType *node_type); SHADER_NODE_BASE_CLASS(BsdfNode) - bool has_spatial_varying() { return true; } void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL); - virtual ClosureType get_closure_type() { return closure; } float3 color; float3 normal; float surface_mix_weight; - - virtual bool equals(const ShaderNode& /*other*/) - { - /* TODO(sergey): With some care BSDF nodes can be de-duplicated. */ - return false; - } }; class AnisotropicBsdfNode : public BsdfNode { @@ -373,7 +375,6 @@ class PrincipledBsdfNode : public BsdfBaseNode { public: SHADER_NODE_CLASS(PrincipledBsdfNode) - bool has_spatial_varying() { return true; } bool has_surface_bssrdf(); bool has_bssrdf_bump(); void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius, @@ -390,13 +391,6 @@ public: float surface_mix_weight; ClosureType distribution, distribution_orig; - virtual bool equals(const ShaderNode * /*other*/) - { - /* TODO(sergey): With some care BSDF nodes can be de-duplicated. 
*/ - return false; - } - - ClosureType get_closure_type() { return closure; } bool has_integrator_dependency(); void attributes(Shader *shader, AttributeRequestSet *attributes); }; diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index 375abfeb27a..12690090066 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -262,6 +262,17 @@ bool Object::is_traceable() return true; } +uint Object::visibility_for_tracing() const { + uint trace_visibility = visibility; + if (is_shadow_catcher) { + trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER; + } + else { + trace_visibility &= ~PATH_RAY_SHADOW_CATCHER; + } + return trace_visibility; +} + /* Object Manager */ ObjectManager::ObjectManager() @@ -356,6 +367,13 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s /* OBJECT_PROPERTIES */ objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index)); + if(mesh->use_motion_blur) { + state->have_motion = true; + } + if(mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) { + flag |= SD_OBJECT_HAS_VERTEX_MOTION; + } + if(state->need_motion == Scene::MOTION_PASS) { /* Motion transformations, is world/object space depending if mesh * comes with deformed position in object space, or if we transform @@ -376,9 +394,6 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s mtfm.pre = mtfm.pre * itfm; mtfm.post = mtfm.post * itfm; } - else { - flag |= SD_OBJECT_HAS_VERTEX_MOTION; - } memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+0], &mtfm.pre, sizeof(float4)*3); memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+3], &mtfm.post, sizeof(float4)*3); @@ -397,10 +412,6 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s } #endif - if(mesh->use_motion_blur) { - state->have_motion = true; - } - /* Dupli object coords and motion info. 
*/ int totalsteps = mesh->motion_steps; int numsteps = (totalsteps - 1)/2; diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 12d7b2c81cf..6927bbfe4c7 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -60,7 +60,7 @@ public: ParticleSystem *particle_system; int particle_index; - + Object(); ~Object(); @@ -75,6 +75,11 @@ public: * kernel scene. */ bool is_traceable(); + + /* Combine object's visibility with all possible internal run-time + * determined flags which denotes trace-time visibility. + */ + uint visibility_for_tracing() const; }; /* Object Manager */ diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index a794f233718..5c5ac6e2be9 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -233,8 +233,10 @@ void OSLShaderManager::shading_system_init() "glossy", /* PATH_RAY_GLOSSY */ "singular", /* PATH_RAY_SINGULAR */ "transparent", /* PATH_RAY_TRANSPARENT */ - "shadow", /* PATH_RAY_SHADOW_OPAQUE */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */ + "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */ + "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */ "__unused__", "__unused__", @@ -719,6 +721,7 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath) current_shader->has_surface_bssrdf = true; current_shader->has_bssrdf_bump = true; /* can't detect yet */ } + current_shader->has_bump = true; /* can't detect yet */ } if(node->has_spatial_varying()) { @@ -1027,6 +1030,9 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet& nodes) if(node->has_bssrdf_bump()) current_shader->has_bssrdf_bump = true; } + if(node->has_bump()) { + current_shader->has_bump = true; + } } else if(current_type == SHADER_TYPE_VOLUME) { if(node->has_spatial_varying()) @@ -1089,21 +1095,14 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader 
*shader) ShaderGraph *graph = shader->graph; ShaderNode *output = (graph)? graph->output(): NULL; - /* copy graph for shader with bump mapping */ - if(output->input("Surface")->link && output->input("Displacement")->link) - if(!shader->graph_bump) - shader->graph_bump = shader->graph->copy(); + bool has_bump = (shader->displacement_method != DISPLACE_TRUE) && + output->input("Surface")->link && output->input("Displacement")->link; /* finalize */ shader->graph->finalize(scene, - false, - shader->has_integrator_dependency); - if(shader->graph_bump) { - shader->graph_bump->finalize(scene, - true, - shader->has_integrator_dependency, - shader->displacement_method == DISPLACE_BOTH); - } + has_bump, + shader->has_integrator_dependency, + shader->displacement_method == DISPLACE_BOTH); current_shader = shader; @@ -1111,7 +1110,8 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader) shader->has_surface_emission = false; shader->has_surface_transparent = false; shader->has_surface_bssrdf = false; - shader->has_bssrdf_bump = false; + shader->has_bump = has_bump; + shader->has_bssrdf_bump = has_bump; shader->has_volume = false; shader->has_displacement = false; shader->has_surface_spatial_varying = false; @@ -1123,8 +1123,8 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader) if(shader->used && graph && output->input("Surface")->link) { shader->osl_surface_ref = compile_type(shader, shader->graph, SHADER_TYPE_SURFACE); - if(shader->graph_bump && shader->displacement_method != DISPLACE_TRUE) - shader->osl_surface_bump_ref = compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP); + if(has_bump) + shader->osl_surface_bump_ref = compile_type(shader, shader->graph, SHADER_TYPE_BUMP); else shader->osl_surface_bump_ref = OSL::ShaderGroupRef(); diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 4db20338744..c59a5d97df5 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ 
-148,8 +148,6 @@ void Scene::device_update(Device *device_, Progress& progress) * - Film needs light manager to run for use_light_visibility * - Lookup tables are done a second time to handle film tables */ - - image_manager->set_pack_images(device->info.pack_images); progress.set_status("Updating Shaders"); shader_manager->device_update(device, &dscene, this, progress); diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 4c2c4f5fcc3..0194327f567 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -121,13 +121,6 @@ public: vector<device_vector<uchar>* > tex_byte_image; vector<device_vector<half>* > tex_half_image; - /* opencl images */ - device_vector<float4> tex_image_float4_packed; - device_vector<uchar4> tex_image_byte4_packed; - device_vector<float> tex_image_float_packed; - device_vector<uchar> tex_image_byte_packed; - device_vector<uint4> tex_image_packed_info; - KernelData data; }; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index ae462a1084a..f68efe38add 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -46,7 +46,7 @@ Session::Session(const SessionParams& params_) : params(params_), tile_manager(params.progressive, params.samples, params.tile_size, params.start_resolution, params.background == false || params.progressive_refine, params.background, params.tile_order, - max(params.device.multi_devices.size(), 1)), + max(params.device.multi_devices.size(), 1), params.pixel_size), stats() { device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background); @@ -721,7 +721,6 @@ DeviceRequestedFeatures Session::get_requested_device_features() BakeManager *bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - requested_features.use_transparent &= 
scene->integrator->transparent_shadows; requested_features.use_denoising = params.use_denoising; return requested_features; @@ -931,7 +930,7 @@ void Session::update_status_time(bool show_pause, bool show_done) const bool rendering_finished = (tile == num_tiles); const bool is_last_tile = (tile + 1) == num_tiles; - substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); + substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles); if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) { /* Some devices automatically support showing the sample number: @@ -961,6 +960,7 @@ void Session::update_status_time(bool show_pause, bool show_done) } else if(show_done) { status = "Done"; + progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */ } else { status = substatus; diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 9f8bb8c42fa..980eda0876d 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -53,6 +53,7 @@ public: int2 tile_size; TileOrder tile_order; int start_resolution; + int pixel_size; int threads; bool display_buffer_linear; @@ -81,6 +82,7 @@ public: samples = INT_MAX; tile_size = make_int2(64, 64); start_resolution = INT_MAX; + pixel_size = 1; threads = 0; use_denoising = false; @@ -110,6 +112,7 @@ public: && experimental == params.experimental && tile_size == params.tile_size && start_resolution == params.start_resolution + && pixel_size == params.pixel_size && threads == params.threads && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 50400edd5ca..86378dfb495 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -177,7 +177,6 @@ Shader::Shader() pass_id = 0; graph = NULL; - graph_bump = NULL; has_surface = false; has_surface_transparent = false; @@ 
-185,11 +184,13 @@ Shader::Shader() has_surface_bssrdf = false; has_volume = false; has_displacement = false; + has_bump = false; has_bssrdf_bump = false; has_surface_spatial_varying = false; has_volume_spatial_varying = false; has_object_dependency = false; has_integrator_dependency = false; + has_volume_connected = false; displacement_method = DISPLACE_BUMP; @@ -203,7 +204,6 @@ Shader::Shader() Shader::~Shader() { delete graph; - delete graph_bump; } bool Shader::is_constant_emission(float3 *emission) @@ -238,9 +238,7 @@ void Shader::set_graph(ShaderGraph *graph_) /* assign graph */ delete graph; - delete graph_bump; graph = graph_; - graph_bump = NULL; /* Store info here before graph optimization to make sure that * nodes that get optimized away still count. */ @@ -457,15 +455,11 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_VOLUME_MIS; if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC) flag |= SD_VOLUME_CUBIC; - if(shader->graph_bump) + if(shader->has_bump) flag |= SD_HAS_BUMP; if(shader->displacement_method != DISPLACE_BUMP) flag |= SD_HAS_DISPLACEMENT; - /* shader with bump mapping */ - if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) - flag |= SD_HAS_BSSRDF_BUMP; - /* constant emission check */ float3 constant_emission = make_float3(0.0f, 0.0f, 0.0f); if(shader->is_constant_emission(&constant_emission)) @@ -502,9 +496,7 @@ void ShaderManager::device_update_common(Device *device, KernelIntegrator *kintegrator = &dscene->data.integrator; kintegrator->use_volumes = has_volumes; /* TODO(sergey): De-duplicate with flags set in integrator.cpp. 
*/ - if(scene->integrator->transparent_shadows) { - kintegrator->transparent_shadows = has_transparent_shadow; - } + kintegrator->transparent_shadows = has_transparent_shadow; } void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene) @@ -609,11 +601,6 @@ void ShaderManager::get_requested_features(Scene *scene, Shader *shader = scene->shaders[i]; /* Gather requested features from all the nodes from the graph nodes. */ get_requested_graph_features(shader->graph, requested_features); - /* Gather requested features from the graph itself. */ - if(shader->graph_bump) { - get_requested_graph_features(shader->graph_bump, - requested_features); - } ShaderNode *output_node = shader->graph->output(); if(output_node->input("Displacement")->link != NULL) { requested_features->nodes_features |= NODE_FEATURE_BUMP; diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index b6714b13247..79a67d6756a 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -89,11 +89,6 @@ public: /* shader graph */ ShaderGraph *graph; - /* shader graph with auto bump mapping included, we compile two shaders, - * with and without bump, because the displacement method is a mesh - * level setting, so we need to handle both */ - ShaderGraph *graph_bump; - /* sampling */ bool use_mis; bool use_transparent_shadow; @@ -121,6 +116,7 @@ public: bool has_volume; bool has_displacement; bool has_surface_bssrdf; + bool has_bump; bool has_bssrdf_bump; bool has_surface_spatial_varying; bool has_volume_spatial_varying; diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index 48287d872d4..32f89897970 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -521,6 +521,9 @@ void SVMCompiler::generate_closure_node(ShaderNode *node, if(node->has_bssrdf_bump()) current_shader->has_bssrdf_bump = true; } + if(node->has_bump()) { + current_shader->has_bump = true; + } } } @@ -799,29 +802,21 @@ void 
SVMCompiler::compile(Scene *scene, Summary *summary) { /* copy graph for shader with bump mapping */ - ShaderNode *node = shader->graph->output(); + ShaderNode *output = shader->graph->output(); int start_num_svm_nodes = svm_nodes.size(); const double time_start = time_dt(); - if(node->input("Surface")->link && node->input("Displacement")->link) - if(!shader->graph_bump) - shader->graph_bump = shader->graph->copy(); + bool has_bump = (shader->displacement_method != DISPLACE_TRUE) && + output->input("Surface")->link && output->input("Displacement")->link; /* finalize */ { scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL); shader->graph->finalize(scene, - false, - shader->has_integrator_dependency); - } - - if(shader->graph_bump) { - scoped_timer timer((summary != NULL)? &summary->time_finalize_bump: NULL); - shader->graph_bump->finalize(scene, - true, - shader->has_integrator_dependency, - shader->displacement_method == DISPLACE_BOTH); + has_bump, + shader->has_integrator_dependency, + shader->displacement_method == DISPLACE_BOTH); } current_shader = shader; @@ -830,7 +825,8 @@ void SVMCompiler::compile(Scene *scene, shader->has_surface_emission = false; shader->has_surface_transparent = false; shader->has_surface_bssrdf = false; - shader->has_bssrdf_bump = false; + shader->has_bump = has_bump; + shader->has_bssrdf_bump = has_bump; shader->has_volume = false; shader->has_displacement = false; shader->has_surface_spatial_varying = false; @@ -839,9 +835,9 @@ void SVMCompiler::compile(Scene *scene, shader->has_integrator_dependency = false; /* generate bump shader */ - if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) { + if(has_bump) { scoped_timer timer((summary != NULL)? 
&summary->time_generate_bump: NULL); - compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP); + compile_type(shader, shader->graph, SHADER_TYPE_BUMP); svm_nodes[index].y = svm_nodes.size(); svm_nodes.insert(svm_nodes.end(), current_svm_nodes.begin(), @@ -853,7 +849,7 @@ void SVMCompiler::compile(Scene *scene, scoped_timer timer((summary != NULL)? &summary->time_generate_surface: NULL); compile_type(shader, shader->graph, SHADER_TYPE_SURFACE); /* only set jump offset if there's no bump shader, as the bump shader will fall thru to this one if it exists */ - if(shader->displacement_method == DISPLACE_TRUE || !shader->graph_bump) { + if(!has_bump) { svm_nodes[index].y = svm_nodes.size(); } svm_nodes.insert(svm_nodes.end(), @@ -895,7 +891,6 @@ SVMCompiler::Summary::Summary() : num_svm_nodes(0), peak_stack_usage(0), time_finalize(0.0), - time_finalize_bump(0.0), time_generate_surface(0.0), time_generate_bump(0.0), time_generate_volume(0.0), @@ -911,10 +906,7 @@ string SVMCompiler::Summary::full_report() const report += string_printf("Peak stack usage: %d\n", peak_stack_usage); report += string_printf("Time (in seconds):\n"); - report += string_printf(" Finalize: %f\n", time_finalize); - report += string_printf(" Bump finalize: %f\n", time_finalize_bump); - report += string_printf("Finalize: %f\n", time_finalize + - time_finalize_bump); + report += string_printf("Finalize: %f\n", time_finalize); report += string_printf(" Surface: %f\n", time_generate_surface); report += string_printf(" Bump: %f\n", time_generate_bump); report += string_printf(" Volume: %f\n", time_generate_volume); diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index abbd9e50610..98ef5fa05d8 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -74,9 +74,6 @@ public: /* Time spent on surface graph finalization. */ double time_finalize; - /* Time spent on bump graph finalization. 
*/ - double time_finalize_bump; - /* Time spent on generating SVM nodes for surface shader. */ double time_generate_surface; diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 176a1f4f0f3..a9620f79fa0 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -88,12 +88,14 @@ enum SpiralDirection { } /* namespace */ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, int start_resolution_, - bool preserve_tile_device_, bool background_, TileOrder tile_order_, int num_devices_) + bool preserve_tile_device_, bool background_, TileOrder tile_order_, + int num_devices_, int pixel_size_) { progressive = progressive_; tile_size = tile_size_; tile_order = tile_order_; start_resolution = start_resolution_; + pixel_size = pixel_size_; num_samples = num_samples_; num_devices = num_devices_; preserve_tile_device = preserve_tile_device_; @@ -163,15 +165,17 @@ void TileManager::set_samples(int num_samples_) uint64_t pixel_samples = 0; /* While rendering in the viewport, the initial preview resolution is increased to the native resolution * before the actual rendering begins. Therefore, additional pixel samples will be rendered. 
*/ - int divider = get_divider(params.width, params.height, start_resolution) / 2; - while(divider > 1) { + int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size); + while(divider > pixel_size) { int image_w = max(1, params.width/divider); int image_h = max(1, params.height/divider); pixel_samples += image_w * image_h; divider >>= 1; } - state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height; + int image_w = max(1, params.width/divider); + int image_h = max(1, params.height/divider); + state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * image_w*image_h; if(schedule_denoising) { state.total_pixel_samples += params.width*params.height; } @@ -471,7 +475,7 @@ bool TileManager::done() int end_sample = (range_num_samples == -1) ? num_samples : range_start_sample + range_num_samples; - return (state.resolution_divider == 1) && + return (state.resolution_divider == pixel_size) && (state.sample+state.num_samples >= end_sample); } @@ -480,9 +484,9 @@ bool TileManager::next() if(done()) return false; - if(progressive && state.resolution_divider > 1) { + if(progressive && state.resolution_divider > pixel_size) { state.sample = 0; - state.resolution_divider /= 2; + state.resolution_divider = max(state.resolution_divider/2, pixel_size); state.num_samples = 1; set_tiles(); } @@ -496,7 +500,7 @@ bool TileManager::next() else state.num_samples = range_num_samples; - state.resolution_divider = 1; + state.resolution_divider = pixel_size; set_tiles(); } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index e39a8f0627a..4cd57b7b30c 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -88,7 +88,7 @@ public: int num_samples; TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution, - bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); + bool 
preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1, int pixel_size = 1); ~TileManager(); void free_device(); @@ -122,6 +122,7 @@ protected: int2 tile_size; TileOrder tile_order; int start_resolution; + int pixel_size; int num_devices; /* in some cases it is important that the same tile will be returned for the same diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 43f9a57d099..7f3747a0f58 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -38,6 +38,7 @@ set(SRC_HEADERS util_atomic.h util_boundbox.h util_debug.h + util_defines.h util_guarded_allocator.cpp util_foreach.h util_function.h diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 643af87a65f..f3c7ae546a0 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -22,16 +22,6 @@ /* Using atomic ops header from Blender. */ #include "atomic_ops.h" -ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) -{ - size_t prev_value = *maximum_value; - while(prev_value < value) { - if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) { - break; - } - } -} - #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 10895f2e918..eb078d69252 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -122,13 +122,16 @@ void DebugFlags::OpenCL::reset() } DebugFlags::DebugFlags() +: viewport_static_bvh(false) { /* Nothing for now. 
*/ } void DebugFlags::reset() { + viewport_static_bvh = false; cpu.reset(); + cuda.reset(); opencl.reset(); } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 450cd900a9f..9255279c5ab 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -30,6 +30,9 @@ CCL_NAMESPACE_BEGIN */ class DebugFlags { public: + /* Use static BVH in viewport, to match final render exactly. */ + bool viewport_static_bvh; + /* Descriptor of CPU feature-set to be used. */ struct CPU { CPU(); diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h new file mode 100644 index 00000000000..ae654092c87 --- /dev/null +++ b/intern/cycles/util/util_defines.h @@ -0,0 +1,135 @@ + +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ + +/* Bitness */ + +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif + +/* Qualifiers for kernel code shared by CPU and GPU */ + +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_constant +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define ccl_ref & +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ + +/* macros */ + +/* hints for branch prediction, only use in code that runs a _lot_ */ +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) +# define HAS_CPP11_FEATURES +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(HAS_CPP11_FEATURES) +/* Some magic to be sure we don't have reference in the type. */ +template<typename T> static inline T decltype_helper(T x) { return x; } +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) +# endif +#endif + +/* Causes warning: + * incompatible types when assigning to type 'Foo' from type 'Bar' + * ... 
the compiler optimizes away the temp var */ +#ifdef __GNUC__ +#define CHECK_TYPE(var, type) { \ + TYPEOF(var) *__tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ +} (void)0 + +#define CHECK_TYPE_PAIR(var_a, var_b) { \ + TYPEOF(var_a) *__tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ +} (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif + +/* can be used in simple macros */ +#define CHECK_TYPE_INLINE(val, type) \ + ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +#endif /* __UTIL_DEFINES_H__ */ + diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index b719640b19c..fb04d49bcd9 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b) #ifndef __KERNEL_GPU__ using std::isfinite; using std::isnan; +using std::sqrt; ccl_device_inline int abs(int x) { @@ -223,7 +224,7 @@ ccl_device_inline bool isfinite_safe(float f) { /* By IEEE 754 rule, 2*Inf equals Inf */ unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); + return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); } ccl_device_inline float ensure_finite(float v) @@ -329,15 +330,22 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t) return (A)(a * ((B)1 - t) + b * t); } +#endif /* __KERNEL_OPENCL__ */ + /* Triangle */ +#ifndef __KERNEL_OPENCL__ ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3) +#else +ccl_device_inline float triangle_area(const float3 v1, + const float3 v2, + const float3 v3) +#endif { return len(cross(v3 - v2, v1 - v2))*0.5f; } -#endif /* __KERNEL_OPENCL__ */ /* Orthonormal vectors */ diff --git a/intern/cycles/util/util_math_float3.h 
b/intern/cycles/util/util_math_float3.h index bb04c4aa2d9..e73e5bc17a2 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f) ccl_device_inline float3 operator*(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 +#if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(a.x*f, a.y*f, a.z*f); @@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a) ccl_device_inline float3 operator/(const float f, const float3& a) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); #else return make_float3(f / a.x, f / a.y, f / a.z); #endif @@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f) ccl_device_inline float3 operator/(const float3& a, const float3& b) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, b.m128)); #else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); #endif @@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t) ccl_device_inline float3 rcp(const float3& a) { #ifdef __KERNEL_SSE__ - const float4 r(_mm_rcp_ps(a.m128)); - return float3(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. 
*/ + return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); #endif diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index d89121b3a1d..aa7e56fefe9 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b); ccl_device_inline float dot(const float4& a, const float4& b); ccl_device_inline float len_squared(const float4& a); ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 sqrt(const float4& a); +ccl_device_inline float4 sqr(const float4& a); ccl_device_inline float4 cross(const float4& a, const float4& b); ccl_device_inline bool is_zero(const float4& a); -ccl_device_inline float reduce_add(const float4& a); ccl_device_inline float average(const float4& a); ccl_device_inline float len(const float4& a); ccl_device_inline float4 normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 fabs(const float4& a); #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b); +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b); template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b); +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b); + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const 
float4& b); @@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b); ccl_device_inline float4 reduce_min(const float4& a); ccl_device_inline float4 reduce_max(const float4& a); -# if 0 ccl_device_inline float4 reduce_add(const float4& a); -# endif #endif /* !__KERNEL_GPU__ */ /******************************************************************************* @@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f) ccl_device_inline float4 operator/(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - return a * rcp(b); + return float4(_mm_div_ps(a.m128, b.m128)); #else return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); #endif @@ -171,8 +176,7 @@ ccl_device_inline float4 operator/=(float4& a, float f) ccl_device_inline int4 operator<(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128))); + return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); #else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); #endif @@ -181,8 +185,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b) ccl_device_inline int4 operator>=(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. */ - return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128))); + return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); #else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); #endif @@ -191,8 +194,7 @@ ccl_device_inline int4 operator>=(const float4& a, const float4& b) ccl_device_inline int4 operator<=(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. 
*/ - return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128))); + return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); #else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); #endif @@ -224,14 +226,30 @@ ccl_device_inline float len_squared(const float4& a) ccl_device_inline float4 rcp(const float4& a) { #ifdef __KERNEL_SSE__ - float4 r(_mm_rcp_ps(a.m128)); - return float4(_mm_sub_ps(_mm_add_ps(r, r), - _mm_mul_ps(_mm_mul_ps(r, r), a))); + /* Don't use _mm_rcp_ps due to poor precision. */ + return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); #else return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); #endif } +ccl_device_inline float4 sqrt(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sqrt_ps(a.m128)); +#else + return make_float4(sqrtf(a.x), + sqrtf(a.y), + sqrtf(a.z), + sqrtf(a.w)); +#endif +} + +ccl_device_inline float4 sqr(const float4& a) +{ + return a * a; +} + ccl_device_inline float4 cross(const float4& a, const float4& b) { #ifdef __KERNEL_SSE__ @@ -254,20 +272,25 @@ ccl_device_inline bool is_zero(const float4& a) #endif } -ccl_device_inline float reduce_add(const float4& a) +ccl_device_inline float4 reduce_add(const float4& a) { #ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE3__ + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return float4( _mm_hadd_ps(h.m128, h.m128)); +# else float4 h(shuffle<1,0,3,2>(a) + a); - /* TODO(sergey): Investigate efficiency. 
*/ - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); + return shuffle<2,3,0,1>(h) + h; +# endif #else - return ((a.x + a.y) + (a.z + a.w)); + float sum = (a.x + a.y) + (a.z + a.w); + return make_float4(sum, sum, sum, sum); #endif } ccl_device_inline float average(const float4& a) { - return reduce_add(a) * 0.25f; + return reduce_add(a).x * 0.25f; } ccl_device_inline float len(const float4& a) @@ -309,6 +332,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b) max(a.w, b.w)); #endif } + +ccl_device_inline float4 fabs(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +#else + return make_float4(fabsf(a.x), + fabsf(a.y), + fabsf(a.z), + fabsf(a.w)); +#endif +} #endif /* !__KERNEL_OPENCL__*/ #ifdef __KERNEL_SSE__ @@ -320,11 +355,28 @@ __forceinline const float4 shuffle(const float4& b) _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); } +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b) +{ + return float4(_mm_shuffle_ps(a.m128, b.m128, + _MM_SHUFFLE(index_3, index_2, index_1, index_0))); +} + template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) { return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); } +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b) +{ + return float4(_mm_movelh_ps(a.m128, b.m128)); +} + +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b) +{ + return float4(_mm_movehl_ps(b.m128, a.m128)); +} + # ifdef __KERNEL_SSE3__ template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { @@ -344,9 +396,7 @@ ccl_device_inline float4 select(const int4& mask, const float4& b) { #ifdef __KERNEL_SSE__ - /* TODO(sergey): avoid cvt. 
*/ - return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), - _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); + return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128))); #else return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, @@ -355,6 +405,13 @@ ccl_device_inline float4 select(const int4& mask, #endif } +ccl_device_inline float4 mask(const int4& mask, + const float4& a) +{ + /* Replace elements of x with zero where mask isn't set. */ + return select(mask, a, make_float4(0.0f)); +} + ccl_device_inline float4 reduce_min(const float4& a) { #ifdef __KERNEL_SSE__ @@ -375,17 +432,15 @@ ccl_device_inline float4 reduce_max(const float4& a) #endif } -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) +ccl_device_inline float4 load_float4(const float *v) { #ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; + return float4(_mm_loadu_ps(v)); #else - return make_float4((a.x + a.y) + (a.z + a.w)); + return make_float4(v[0], v[1], v[2], v[3]); #endif } -#endif + #endif /* !__KERNEL_GPU__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index c7511f8306e..b31dbe4fc67 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float { const float singular_epsilon = 1e-9f; - for (int row = 0; row < n; row++) { - for (int col = 0; col < n; col++) { + for(int row = 0; row < n; row++) { + for(int col = 0; col < n; col++) { MATS(V, n, row, col, v_stride) = (col == row) ? 
1.0f : 0.0f; } } - for (int sweep = 0; sweep < 8; sweep++) { + for(int sweep = 0; sweep < 8; sweep++) { float off_diagonal = 0.0f; - for (int row = 1; row < n; row++) { - for (int col = 0; col < row; col++) { + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { off_diagonal += fabsf(MAT(A, n, row, col)); } } - if (off_diagonal < 1e-7f) { + if(off_diagonal < 1e-7f) { /* The matrix has nearly reached diagonal form. * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ break; @@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float float abs_element = fabsf(element); /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ - if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { MAT(A, n, row, col) = 0.0f; continue; } @@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float * Then, we compute sin(phi) and cos(phi) themselves. */ float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); float ratio; - if (abs_element > singular_epsilon*fabsf(singular_diff)) { + if(abs_element > singular_epsilon*fabsf(singular_diff)) { float cot_2phi = 0.5f*singular_diff / element; ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); - if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ } else { ratio = element / singular_diff; @@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } /* Sort eigenvalues and the associated eigenvectors. 
*/ - for (int i = 0; i < n - 1; i++) { + for(int i = 0; i < n - 1; i++) { float v = MAT(A, n, i, i); int k = i; - for (int j = i; j < n; j++) { - if (MAT(A, n, j, j) >= v) { + for(int j = i; j < n; j++) { + if(MAT(A, n, j, j) >= v) { v = MAT(A, n, j, j); k = j; } } - if (k != i) { + if(k != i) { /* Swap eigenvalues. */ MAT(A, n, k, k) = MAT(A, n, i, i); MAT(A, n, i, i) = v; /* Swap eigenvectors. */ - for (int j = 0; j < n; j++) { + for(int j = 0; j < n; j++) { float v = MATS(V, n, i, j, v_stride); MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); MATS(V, n, k, j, v_stride) = v; @@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } #ifdef __KERNEL_SSE3__ -ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +ccl_device_inline void math_vector_zero_sse(float4 *A, int n) { for(int i = 0; i < n; i++) { - A[i] = _mm_setzero_ps(); + A[i] = make_float4(0.0f); } } -ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +ccl_device_inline void math_matrix_zero_sse(float4 *A, int n) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_setzero_ps(); + MAT(A, n, row, col) = make_float4(0.0f); } } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. 
*/ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight; } } } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_add_ps(V[i], a[i]); + V[i] += a[i]; } } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a) { for(int i = 0; i < n; i++) { - V[i] = _mm_mul_ps(V[i], a[i]); + V[i] *= a[i]; } } -ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n) { for(int i = 0; i < n; i++) { - a[i] = _mm_max_ps(a[i], b[i]); + a[i] = max(a[i], b[i]); } } -ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B) { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0]; } } } diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 6f70a474fe7..3c5785c4807 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -19,22 +19,15 @@ #ifndef __KERNEL_GPU__ -/* quiet unused define warnings */ -#if 
defined(__KERNEL_SSE2__) || \ - defined(__KERNEL_SSE3__) || \ - defined(__KERNEL_SSSE3__) || \ - defined(__KERNEL_SSE41__) || \ - defined(__KERNEL_AVX__) || \ - defined(__KERNEL_AVX2__) - /* do nothing */ -#endif - /* x86 * * Compile a regular, SSE2 and SSE3 kernel. */ #if defined(i386) || defined(_M_IX86) +/* We require minimum SSE2 support on x86, so auto enable. */ +# define __KERNEL_SSE2__ + # ifdef WITH_KERNEL_SSE2 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # endif @@ -73,48 +66,6 @@ #endif /* defined(__x86_64__) || defined(_M_X64) */ -/* SSE Experiment - * - * This is disabled code for an experiment to use SSE types globally for types - * such as float3 and float4. Currently this gives an overall slowdown. */ - -#if 0 -# define __KERNEL_SSE__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# ifndef __KERNEL_SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifndef __KERNEL_SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifndef __KERNEL_SSE4__ -# define __KERNEL_SSE4__ -# endif -#endif - -/* SSE Intrinsics includes - * - * We assume __KERNEL_SSEX__ flags to have been defined at this point */ - -/* SSE intrinsics headers */ -#ifndef FREE_WINDOWS64 - -#ifdef _MSC_VER -# include <intrin.h> -#elif (defined(__x86_64__) || defined(__i386__)) -# include <x86intrin.h> -#endif - -#else - -/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. 
- * Since we can't avoid including <windows.h>, better only include that */ -#include "util/util_windows.h" - -#endif - #endif #endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index f9c3b4bb139..bae5d5bd6d1 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -45,6 +45,7 @@ OIIO_NAMESPACE_USING # include <shlwapi.h> #endif +#include "util/util_map.h" #include "util/util_windows.h" CCL_NAMESPACE_BEGIN @@ -768,68 +769,180 @@ bool path_remove(const string& path) return remove(path.c_str()) == 0; } -static string line_directive(const string& base, const string& path, int line) +struct SourceReplaceState { + typedef map<string, string> ProcessedMapping; + /* Base director for all relative include headers. */ + string base; + /* Result of processed files. */ + ProcessedMapping processed_files; + /* Set of files which are considered "precompiled" and which are replaced + * with and empty string on a subsequent occurrence in include statement. + */ + set<string> precompiled_headers; +}; + +static string path_source_replace_includes_recursive( + const string& source, + const string& source_filepath, + SourceReplaceState *state); + +static string line_directive(const SourceReplaceState& state, + const string& path, + const int line) { - string escaped_path = path; + string unescaped_path = path; /* First we make path relative. */ - if(string_startswith(escaped_path, base.c_str())) { - const string base_file = path_filename(base); - const size_t base_len = base.length(); - escaped_path = base_file + escaped_path.substr(base_len, - escaped_path.length() - base_len); + if(string_startswith(unescaped_path, state.base.c_str())) { + const string base_file = path_filename(state.base); + const size_t base_len = state.base.length(); + unescaped_path = base_file + + unescaped_path.substr(base_len, + unescaped_path.length() - base_len); } /* Second, we replace all unsafe characters. 
*/ - string_replace(escaped_path, "\"", "\\\""); - string_replace(escaped_path, "\'", "\\\'"); - string_replace(escaped_path, "\?", "\\\?"); - string_replace(escaped_path, "\\", "\\\\"); + const size_t length = unescaped_path.length(); + string escaped_path = ""; + for(size_t i = 0; i < length; ++i) { + const char ch = unescaped_path[i]; + if(strchr("\"\'\?\\", ch) != NULL) { + escaped_path += "\\"; + } + escaped_path += ch; + } + /* TODO(sergey): Check whether using std::to_string combined with several + * concatenation operations is any faster. + */ return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); } +static string path_source_handle_preprocessor( + const string& preprocessor_line, + const string& source_filepath, + const size_t line_number, + SourceReplaceState *state) +{ + string result = preprocessor_line; + string token = string_strip( + preprocessor_line.substr(1, preprocessor_line.size() - 1)); + if(string_startswith(token, "include")) { + token = string_strip(token.substr(7, token.size() - 7)); + if(token[0] == '"') { + const size_t n_start = 1; + const size_t n_end = token.find("\"", n_start); + const string filename = token.substr(n_start, n_end - n_start); + const bool is_precompiled = string_endswith(token, "// PRECOMPILED"); + string filepath = path_join(state->base, filename); + if(!path_exists(filepath)) { + filepath = path_join(path_dirname(source_filepath), + filename); + } + if(is_precompiled) { + state->precompiled_headers.insert(filepath); + } + string text; + if(path_read_text(filepath, text)) { + text = path_source_replace_includes_recursive( + text, filepath, state); + /* Use line directives for better error messages. 
*/ + result = line_directive(*state, filepath, 1) + "\n" + + text + "\n" + + line_directive(*state, source_filepath, line_number + 1); + } + } + } + return result; +} + +/* Our own little c preprocessor that replaces #includes with the file + * contents, to work around issue of OpenCL drivers not supporting + * include paths with spaces in them. + */ static string path_source_replace_includes_recursive( - const string& base, const string& source, - const string& source_filepath) + const string& source_filepath, + SourceReplaceState *state) { - /* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of OpenCL drivers not supporting - * include paths with spaces in them. + /* Try to re-use processed file without spending time on replacing all + * include directives again. */ - + SourceReplaceState::ProcessedMapping::iterator replaced_file = + state->processed_files.find(source_filepath); + if(replaced_file != state->processed_files.end()) { + if(state->precompiled_headers.find(source_filepath) != + state->precompiled_headers.end()) { + return ""; + } + return replaced_file->second; + } + /* Perform full file processing. 
*/ string result = ""; - vector<string> lines; - string_split(lines, source, "\n", false); - - for(size_t i = 0; i < lines.size(); ++i) { - string line = lines[i]; - if(line[0] == '#') { - string token = string_strip(line.substr(1, line.size() - 1)); - if(string_startswith(token, "include")) { - token = string_strip(token.substr(7, token.size() - 7)); - if(token[0] == '"') { - const size_t n_start = 1; - const size_t n_end = token.find("\"", n_start); - const string filename = token.substr(n_start, n_end - n_start); - string filepath = path_join(base, filename); - if(!path_exists(filepath)) { - filepath = path_join(path_dirname(source_filepath), - filename); - } - string text; - if(path_read_text(filepath, text)) { - text = path_source_replace_includes_recursive( - base, text, filepath); - /* Use line directives for better error messages. */ - line = line_directive(base, filepath, 1) - + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(base, source_filepath, i + 1); - } - } + const size_t source_length = source.length(); + size_t index = 0; + /* Information about where we are in the source. */ + size_t line_number = 0, column_number = 1; + /* Currently gathered non-preprocessor token. + * Store as start/length rather than token itself to avoid overhead of + * memory re-allocations on each character concatenation. + */ + size_t token_start = 0, token_length = 0; + /* Denotes whether we're inside of preprocessor line, together with + * preprocessor line itself. + * + * TODO(sergey): Investigate whether using token start/end position + * gives measurable speedup. + */ + bool inside_preprocessor = false; + string preprocessor_line = ""; + /* Actual loop over the whole source. */ + while(index < source_length) { + const char ch = source[index]; + if(ch == '\n') { + if(inside_preprocessor) { + result += path_source_handle_preprocessor(preprocessor_line, + source_filepath, + line_number, + state); + /* Start gathering net part of the token. 
*/ + token_start = index; + token_length = 0; + } + inside_preprocessor = false; + preprocessor_line = ""; + column_number = 0; + ++line_number; + } + else if(ch == '#' && column_number == 1 && !inside_preprocessor) { + /* Append all possible non-preprocessor token to the result. */ + if(token_length != 0) { + result.append(source, token_start, token_length); + token_start = index; + token_length = 0; } + inside_preprocessor = true; + } + if(inside_preprocessor) { + preprocessor_line += ch; + } + else { + ++token_length; } - result += line + "\n"; + ++index; + ++column_number; } - + /* Append possible tokens which happened before special events handled + * above. + */ + if(token_length != 0) { + result.append(source, token_start, token_length); + } + if(inside_preprocessor) { + result += path_source_handle_preprocessor(preprocessor_line, + source_filepath, + line_number, + state); + } + /* Store result for further reuse. */ + state->processed_files[source_filepath] = result; return result; } @@ -837,10 +950,12 @@ string path_source_replace_includes(const string& source, const string& path, const string& source_filename) { + SourceReplaceState state; + state.base = path; return path_source_replace_includes_recursive( - path, source, - path_join(path, source_filename)); + path_join(path, source_filename), + &state); } FILE *path_fopen(const string& path, const string& mode) diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index cd4fe52fdc9..134383e88db 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -41,6 +41,7 @@ public: denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -80,6 +81,7 @@ public: denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -146,6 +148,7 @@ public: 
thread_scoped_lock lock(progress_mutex); start_time = time_dt(); + end_time = 0.0; } void set_render_start_time() @@ -169,8 +172,15 @@ public: { thread_scoped_lock lock(progress_mutex); - total_time_ = time_dt() - start_time; - render_time_ = time_dt() - render_start_time; + double time = (end_time > 0) ? end_time : time_dt(); + + total_time_ = time - start_time; + render_time_ = time - render_start_time; + } + + void set_end_time() + { + end_time = time_dt(); } void reset_sample() @@ -337,6 +347,8 @@ protected: int rendered_tiles, denoised_tiles; double start_time, render_start_time; + /* End time written when render is done, so it doesn't keep increasing on redraws. */ + double end_time; string status; string substatus; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 587febe3e52..58b3d267266 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -18,19 +18,38 @@ #ifndef __UTIL_SIMD_TYPES_H__ #define __UTIL_SIMD_TYPES_H__ +#ifndef __KERNEL_GPU__ + #include <limits> #include "util/util_debug.h" -#include "util/util_types.h" +#include "util/util_defines.h" + +/* SSE Intrinsics includes + * + * We assume __KERNEL_SSEX__ flags to have been defined at this point */ + +/* SSE intrinsics headers */ +#ifndef FREE_WINDOWS64 + +#ifdef _MSC_VER +# include <intrin.h> +#elif (defined(__x86_64__) || defined(__i386__)) +# include <x86intrin.h> +#endif + +#else + +/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. 
+ * Since we can't avoid including <windows.h>, better only include that */ +#include "util/util_windows.h" + +#endif CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ -struct sseb; -struct ssei; -struct ssef; - extern const __m128 _mm_lookupmask_ps[16]; /* Special Types */ @@ -328,12 +347,12 @@ __forceinline size_t __bscf(size_t& v) #endif /* _WIN32 */ -static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; -static const size_t BITSCAN_NO_BIT_SET_64 = 64; +/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test + * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other + * platforms when compiling code outside the kernel. */ +#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) -#ifdef __KERNEL_SSE3__ -/* Emulation of SSE4 functions with SSE3 */ -# ifndef __KERNEL_SSE41__ +/* Emulation of SSE4 functions with SSE2 */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -342,50 +361,50 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_CUR_DIRECTION 0x04 #undef _mm_blendv_ps -#define _mm_blendv_ps __emu_mm_blendv_ps -__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { - return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); +#define _mm_blendv_ps _mm_blendv_ps_emu +__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask) +{ + __m128i isignmask = _mm_set1_epi32(0x80000000); + __m128 signmask = _mm_castsi128_ps(isignmask); + __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask)); + __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask); + __m128 cmpmask = _mm_castsi128_ps(icmpmask); + return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value)); } #undef _mm_blend_ps -#define _mm_blend_ps __emu_mm_blend_ps -__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { +#define _mm_blend_ps _mm_blend_ps_emu +__forceinline __m128 _mm_blend_ps_emu( __m128 
value, __m128 input, const int mask) +{ assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } #undef _mm_blendv_epi8 -#define _mm_blendv_epi8 __emu_mm_blendv_epi8 -__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { +#define _mm_blendv_epi8 _mm_blendv_epi8_emu +__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask) +{ return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } -#undef _mm_mullo_epi32 -#define _mm_mullo_epi32 __emu_mm_mullo_epi32 -__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { - __m128i rvalue; - char* _r = (char*)(&rvalue + 1); - char* _v = (char*)(& value + 1); - char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); - return rvalue; -} - #undef _mm_min_epi32 -#define _mm_min_epi32 __emu_mm_min_epi32 -__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { +#define _mm_min_epi32 _mm_min_epi32_emu +__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input) +{ return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } #undef _mm_max_epi32 -#define _mm_max_epi32 __emu_mm_max_epi32 -__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { +#define _mm_max_epi32 _mm_max_epi32_emu +__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input) +{ return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } #undef _mm_extract_epi32 -#define _mm_extract_epi32 __emu_mm_extract_epi32 -__forceinline int _mm_extract_epi32( __m128i input, const int index ) { - switch ( index ) { +#define _mm_extract_epi32 _mm_extract_epi32_emu +__forceinline int _mm_extract_epi32_emu( __m128i input, const int index) +{ + switch(index) { case 0: return _mm_cvtsi128_si32(input); case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1))); case 2: return 
_mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2))); @@ -395,27 +414,26 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } #undef _mm_insert_epi32 -#define _mm_insert_epi32 __emu_mm_insert_epi32 -__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { +#define _mm_insert_epi32 _mm_insert_epi32_emu +__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index) +{ assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } -#undef _mm_extract_ps -#define _mm_extract_ps __emu_mm_extract_ps -__forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32_t* ptr = (int32_t*)&input; return ptr[index]; -} - #undef _mm_insert_ps -#define _mm_insert_ps __emu_mm_insert_ps -__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) -{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } +#define _mm_insert_ps _mm_insert_ps_emu +__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index) +{ + assert(index < 0x100); + ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; + return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); +} #undef _mm_round_ps -#define _mm_round_ps __emu_mm_round_ps -__forceinline __m128 _mm_round_ps( __m128 value, const int flags ) +#define _mm_round_ps _mm_round_ps_emu +__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags) { - switch ( flags ) + switch(flags) { case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value)); case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f)))); @@ -425,57 +443,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -# ifdef _M_X64 -#undef _mm_insert_epi64 -#define _mm_insert_epi64 __emu_mm_insert_epi64 -__forceinline 
__m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { - assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; -} - -#undef _mm_extract_epi64 -#define _mm_extract_epi64 __emu_mm_extract_epi64 -__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { - assert(size_t(index) < 2); - return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); -} -# endif - -# endif - -#undef _mm_fabs_ps -#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) - -/* Return a __m128 with every element set to the largest element of v. */ -ccl_device_inline __m128 _mm_hmax_ps(__m128 v) -{ - /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ - v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); - /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ - v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); - return v; -} - -/* Return the sum of the four elements of x. */ -ccl_device_inline float _mm_hsum_ss(__m128 x) -{ - __m128 a = _mm_movehdup_ps(x); - __m128 b = _mm_add_ps(x, a); - return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); -} - -/* Return a __m128 with every element set to the sum of the four elements of x. */ -ccl_device_inline __m128 _mm_hsum_ps(__m128 x) -{ - x = _mm_hadd_ps(x, x); - x = _mm_hadd_ps(x, x); - return x; -} - -/* Replace elements of x with zero where mask isn't set. 
*/ -#undef _mm_mask_ps -#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) - -#endif +#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ #else /* __KERNEL_SSE2__ */ @@ -496,13 +464,19 @@ ccl_device_inline int bitscan(int value) #endif /* __KERNEL_SSE2__ */ +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) || \ + defined(__KERNEL_SSE3__) || \ + defined(__KERNEL_SSSE3__) || \ + defined(__KERNEL_SSE41__) || \ + defined(__KERNEL_AVX__) || \ + defined(__KERNEL_AVX2__) + /* do nothing */ +#endif + CCL_NAMESPACE_END -#include "util/util_math.h" -#include "util/util_sseb.h" -#include "util/util_ssei.h" -#include "util/util_ssef.h" -#include "util/util_avxf.h" +#endif /* __KERNEL_GPU__ */ #endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index 6e669701f3b..93c22aafdcd 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct ssei; +struct ssef; + /*! 4-wide SSE bool type. */ struct sseb { diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index cf99a08efae..bb007ff84a9 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE float type. */ struct ssef { diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h index 5f62569268c..ef2a9e68b7d 100644 --- a/intern/cycles/util/util_ssei.h +++ b/intern/cycles/util/util_ssei.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE integer type. 
*/ struct ssei { @@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a #else -__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); } -__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); } +__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; } +__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; } +__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); } +__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); } __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; } #endif diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index baba549753d..7667f58eb7d 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -30,7 +30,7 @@ public: void mem_alloc(size_t size) { atomic_add_and_fetch_z(&mem_used, size); - atomic_update_max_z(&mem_peak, mem_used); + atomic_fetch_and_update_max_z(&mem_peak, mem_used); } void mem_free(size_t size) { diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index fb0c34e1dc4..6ed97b0e0a6 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads) threads.resize(num_threads); const int num_groups = system_cpu_group_count(); - unsigned short num_process_groups; + unsigned short num_process_groups = 0; vector<unsigned short> process_groups; - int current_group_threads; + int current_group_threads = 0; if(num_groups > 1) { process_groups.resize(num_groups); num_process_groups = system_cpu_process_groups(num_groups, diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h index 65798244111..f03aa590e9b 100644 --- a/intern/cycles/util/util_time.h +++ b/intern/cycles/util/util_time.h @@ -37,7 +37,7 @@ 
public: ~scoped_timer() { if(value_ != NULL) { - *value_ = time_dt() - time_start_; + *value_ = get_time(); } } @@ -46,6 +46,11 @@ public: return time_start_; } + double get_time() const + { + return time_dt() - time_start_; + } + protected: double *value_; double time_start_; diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a5d1d7152d5..aabca6c81fc 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -21,72 +21,18 @@ # include <stdlib.h> #endif -/* Bitness */ +/* Standard Integer Types */ -#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -# define __KERNEL_64_BIT__ +#if !defined(__KERNEL_GPU__) && !defined(_WIN32) +# include <stdint.h> #endif -/* Qualifiers for kernel code shared by CPU and GPU */ - -#ifndef __KERNEL_GPU__ -# define ccl_device static inline -# define ccl_device_noinline static -# define ccl_global -# define ccl_constant -# define ccl_local -# define ccl_local_param -# define ccl_private -# define ccl_restrict __restrict -# define __KERNEL_WITH_SSE_ALIGN__ - -# if defined(_WIN32) && !defined(FREE_WINDOWS) -# define ccl_device_inline static __forceinline -# define ccl_device_forceinline static __forceinline -# define ccl_align(...) __declspec(align(__VA_ARGS__)) -# ifdef __KERNEL_64_BIT__ -# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -# else /* __KERNEL_64_BIT__ */ -# undef __KERNEL_WITH_SSE_ALIGN__ -/* No support for function arguments (error C2719). */ -# define ccl_try_align(...) -# endif /* __KERNEL_64_BIT__ */ -# define ccl_may_alias -# define ccl_always_inline __forceinline -# define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused -# else /* _WIN32 && !FREE_WINDOWS */ -# define ccl_device_inline static inline __attribute__((always_inline)) -# define ccl_device_forceinline static inline __attribute__((always_inline)) -# define ccl_align(...) 
__attribute__((aligned(__VA_ARGS__))) -# ifndef FREE_WINDOWS64 -# define __forceinline inline __attribute__((always_inline)) -# endif -# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -# define ccl_may_alias __attribute__((__may_alias__)) -# define ccl_always_inline __attribute__((always_inline)) -# define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) -# endif /* _WIN32 && !FREE_WINDOWS */ - -/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ -# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ -# define ATTR_FALLTHROUGH __attribute__((fallthrough)) -# else -# define ATTR_FALLTHROUGH ((void)0) -# endif -#endif /* __KERNEL_GPU__ */ - -/* Standard Integer Types */ +#include "util/util_defines.h" #ifndef __KERNEL_GPU__ -/* int8_t, uint16_t, and friends */ -# ifndef _WIN32 -# include <stdint.h> -# endif -/* SIMD Types */ # include "util/util_optimization.h" -#endif /* __KERNEL_GPU__ */ +# include "util/util_simd.h" +#endif CCL_NAMESPACE_BEGIN @@ -201,65 +147,8 @@ enum ExtensionType { EXTENSION_NUM_TYPES, }; -/* macros */ - -/* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -#else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -#endif - -#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) -# define HAS_CPP11_FEATURES -#endif - -#if defined(__GNUC__) || defined(__clang__) -# if defined(HAS_CPP11_FEATURES) -/* Some magic to be sure we don't have reference in the type. */ -template<typename T> static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif - -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ... 
the compiler optimizes away the temp var */ -#ifdef __GNUC__ -#define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ -} (void)0 - -#define CHECK_TYPE_PAIR(var_a, var_b) { \ - TYPEOF(var_a) *__tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ -} (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -#endif - -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) \ - ((void)(((type)0) != (val))) - - CCL_NAMESPACE_END -#ifndef __KERNEL_GPU__ -# include <cassert> -# define util_assert(statement) assert(statement) -#else -# define util_assert(statement) -#endif - /* Vectorized types declaration. */ #include "util/util_types_uchar2.h" #include "util/util_types_uchar3.h" @@ -298,5 +187,13 @@ CCL_NAMESPACE_END #include "util/util_types_vector3_impl.h" +/* SSE types. */ +#ifndef __KERNEL_GPU__ +# include "util/util_sseb.h" +# include "util/util_ssei.h" +# include "util/util_ssef.h" +# include "util/util_avxf.h" +#endif + #endif /* __UTIL_TYPES_H__ */ |