diff options
Diffstat (limited to 'intern/cycles')
322 files changed, 9826 insertions, 7279 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 79c1c3e3e82..806a8660e8c 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -22,6 +22,7 @@ if(WITH_CYCLES_NATIVE_ONLY) -DWITH_KERNEL_NATIVE ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + set(CYCLES_KERNEL_FLAGS "-march=native") elseif(NOT WITH_CPU_SSE) set(CXX_HAS_SSE FALSE) set(CXX_HAS_AVX FALSE) @@ -59,10 +60,13 @@ elseif(WIN32 AND MSVC) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox") set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox") + + set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") elseif(CMAKE_COMPILER_IS_GNUCC) check_cxx_compiler_flag(-msse CXX_HAS_SSE) check_cxx_compiler_flag(-mavx CXX_HAS_AVX) check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) + set(CYCLES_KERNEL_FLAGS "-ffast-math") if(CXX_HAS_SSE) set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") @@ -74,10 +78,12 @@ elseif(CMAKE_COMPILER_IS_GNUCC) if(CXX_HAS_AVX2) set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse") endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") check_cxx_compiler_flag(-msse CXX_HAS_SSE) check_cxx_compiler_flag(-mavx CXX_HAS_AVX) check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) + set(CYCLES_KERNEL_FLAGS "-ffast-math") if(CXX_HAS_SSE) set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2") set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") @@ -89,6 +95,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") if(CXX_HAS_AVX2) set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") endif() + 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") endif() if(CXX_HAS_SSE) diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 8cd499b7ca6..aabb8f63640 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -1,14 +1,6 @@ set(INC - . - ../bvh - ../device - ../graph - ../kernel - ../kernel/svm - ../render - ../subd - ../util + .. ) set(INC_SYS ) diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp index 4ef9cd070bb..e65b9d769e4 100644 --- a/intern/cycles/app/cycles_server.cpp +++ b/intern/cycles/app/cycles_server.cpp @@ -16,15 +16,15 @@ #include <stdio.h> -#include "device.h" - -#include "util_args.h" -#include "util_foreach.h" -#include "util_path.h" -#include "util_stats.h" -#include "util_string.h" -#include "util_task.h" -#include "util_logging.h" +#include "device/device.h" + +#include "util/util_args.h" +#include "util/util_foreach.h" +#include "util/util_path.h" +#include "util/util_stats.h" +#include "util/util_string.h" +#include "util/util_task.h" +#include "util/util_logging.h" using namespace ccl; diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp index 9816d614a7c..0cd249f0d84 100644 --- a/intern/cycles/app/cycles_standalone.cpp +++ b/intern/cycles/app/cycles_standalone.cpp @@ -16,29 +16,29 @@ #include <stdio.h> -#include "buffers.h" -#include "camera.h" -#include "device.h" -#include "scene.h" -#include "session.h" -#include "integrator.h" - -#include "util_args.h" -#include "util_foreach.h" -#include "util_function.h" -#include "util_logging.h" -#include "util_path.h" -#include "util_progress.h" -#include "util_string.h" -#include "util_time.h" -#include "util_transform.h" -#include "util_version.h" +#include "render/buffers.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/integrator.h" + +#include 
"util/util_args.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_progress.h" +#include "util/util_string.h" +#include "util/util_time.h" +#include "util/util_transform.h" +#include "util/util_version.h" #ifdef WITH_CYCLES_STANDALONE_GUI -#include "util_view.h" +#include "util/util_view.h" #endif -#include "cycles_xml.h" +#include "app/cycles_xml.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp index 35a30ae683f..04f00ef0e10 100644 --- a/intern/cycles/app/cycles_xml.cpp +++ b/intern/cycles/app/cycles_xml.cpp @@ -20,31 +20,31 @@ #include <algorithm> #include <iterator> -#include "node_xml.h" - -#include "background.h" -#include "camera.h" -#include "film.h" -#include "graph.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "osl.h" -#include "shader.h" -#include "scene.h" - -#include "subd_patch.h" -#include "subd_split.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_path.h" -#include "util_transform.h" -#include "util_xml.h" - -#include "cycles_xml.h" +#include "graph/node_xml.h" + +#include "render/background.h" +#include "render/camera.h" +#include "render/film.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/osl.h" +#include "render/shader.h" +#include "render/scene.h" + +#include "subd/subd_patch.h" +#include "subd/subd_split.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_path.h" +#include "util/util_transform.h" +#include "util/util_xml.h" + +#include "app/cycles_xml.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index b57502b3b14..ae4977aaed0 100644 --- 
a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - ../graph - ../render - ../device - ../kernel - ../kernel/svm - ../util - ../subd + .. ../../glew-mx ../../guardedalloc ../../mikktspace diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index 235d19e91e8..eb792af7264 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -107,7 +107,13 @@ def engine_exit(): engine.exit() +classes = ( + CyclesRender, +) + + def register(): + from bpy.utils import register_class from . import ui from . import properties from . import presets @@ -122,12 +128,15 @@ def register(): properties.register() ui.register() presets.register() - bpy.utils.register_module(__name__) + + for cls in classes: + register_class(cls) bpy.app.handlers.version_update.append(version_update.do_versions) def unregister(): + from bpy.utils import unregister_class from . import ui from . import properties from . 
import presets @@ -138,4 +147,6 @@ def unregister(): ui.unregister() properties.unregister() presets.unregister() - bpy.utils.unregister_module(__name__) + + for cls in classes: + unregister_class(cls) diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index c8c9ef58c52..ab57dd44bdb 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -50,6 +50,24 @@ def _workaround_buggy_drivers(): _cycles.opencl_disable() +def _configure_argument_parser(): + import argparse + parser = argparse.ArgumentParser(description="Cycles Addon argument parser") + parser.add_argument("--cycles-resumable-num-chunks", + help="Number of chunks to split sample range into", + default=None) + parser.add_argument("--cycles-resumable-current-chunk", + help="Current chunk of samples range to render", + default=None) + parser.add_argument("--cycles-resumable-start-chunk", + help="Start chunk to render", + default=None) + parser.add_argument("--cycles-resumable-end-chunk", + help="End chunk to render", + default=None) + return parser + + def _parse_command_line(): import sys @@ -57,25 +75,22 @@ def _parse_command_line(): if "--" not in argv: return - argv = argv[argv.index("--") + 1:] - - num_resumable_chunks = None - current_resumable_chunk = None - - # TODO(sergey): Add some nice error prints if argument is not used properly. 
- idx = 0 - while idx < len(argv) - 1: - arg = argv[idx] - if arg == '--cycles-resumable-num-chunks': - num_resumable_chunks = int(argv[idx + 1]) - elif arg == '--cycles-resumable-current-chunk': - current_resumable_chunk = int(argv[idx + 1]) - idx += 1 + parser = _configure_argument_parser() + args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:]) - if num_resumable_chunks is not None and current_resumable_chunk is not None: - import _cycles - _cycles.set_resumable_chunks(num_resumable_chunks, - current_resumable_chunk) + if args.cycles_resumable_num_chunks is not None: + if args.cycles_resumable_current_chunk is not None: + import _cycles + _cycles.set_resumable_chunk( + int(args.cycles_resumable_num_chunks), + int(args.cycles_resumable_current_chunk)) + elif args.cycles_resumable_start_chunk is not None and \ + args.cycles_resumable_end_chunk: + import _cycles + _cycles.set_resumable_chunk_range( + int(args.cycles_resumable_num_chunks), + int(args.cycles_resumable_start_chunk), + int(args.cycles_resumable_end_chunk)) def init(): diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index f97b51b629d..82c4ffc6e50 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -82,12 +82,23 @@ class AddPresetSampling(AddPresetBase, Operator): preset_subdir = "cycles/sampling" +classes = ( + AddPresetIntegrator, + AddPresetSampling, +) + + def register(): - pass + from bpy.utils import register_class + for cls in classes: + register_class(cls) def unregister(): - pass + from bpy.utils import unregister_class + for cls in classes: + unregister_class(cls) + if __name__ == "__main__": register() diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 5c51f9afc28..cbf469b3a89 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -665,8 +665,10 @@ class 
CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True) cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True) cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True) + cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False) + cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_opencl_kernel_type = EnumProperty( name="OpenCL Kernel Type", @@ -693,6 +695,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): update=devices_update_callback ) + cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback); + cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False) @classmethod @@ -1092,6 +1096,12 @@ class CyclesObjectSettings(bpy.types.PropertyGroup): default=1.0, ) + cls.is_shadow_catcher = BoolProperty( + name="Shadow Catcher", + description="Only render shadows on this object, for compositing renders into real footage", + default=False, + ) + @classmethod def unregister(cls): del bpy.types.Object.cycles diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 44af5f7efed..2b50d272be8 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -86,12 +86,10 @@ def use_sample_all_lights(context): return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect -def show_device_selection(context): - type = get_device_type(context) - if type == 'NETWORK': +def show_device_active(context): + cscene = context.scene.cycles + if cscene.device != 'GPU': return True - if not type in {'CUDA', 'OPENCL'}: - return False return context.user_preferences.addons[__package__].preferences.has_active_device() @@ -186,9 +184,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): 
sub.label(text="AA Samples:") sub.prop(cscene, "aa_samples", text="Render") sub.prop(cscene, "preview_aa_samples", text="Preview") - sub.separator() - sub.prop(cscene, "sample_all_lights_direct") - sub.prop(cscene, "sample_all_lights_indirect") col = split.column() sub = col.column(align=True) @@ -205,6 +200,10 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): sub.prop(cscene, "subsurface_samples", text="Subsurface") sub.prop(cscene, "volume_samples", text="Volume") + col = layout.column(align=True) + col.prop(cscene, "sample_all_lights_direct") + col.prop(cscene, "sample_all_lights_indirect") + if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'): layout.row().prop(cscene, "sampling_pattern", text="Pattern") @@ -270,7 +269,7 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel): row = col.row() row.prop(ccscene, "minimum_width", text="Min Pixels") - row.prop(ccscene, "maximum_width", text="Max Ext.") + row.prop(ccscene, "maximum_width", text="Max Extension") class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): @@ -788,6 +787,8 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel): if ob.type != 'LAMP': flow.prop(visibility, "shadow") + layout.prop(cob, "is_shadow_catcher") + col = layout.column() col.label(text="Performance:") row = col.row() @@ -1518,15 +1519,18 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_use_qbvh") + col.prop(cscene, "debug_use_cpu_split_kernel") col = layout.column() col.label('CUDA Flags:') col.prop(cscene, "debug_use_cuda_adaptive_compile") + col.prop(cscene, "debug_use_cuda_split_kernel") col = layout.column() col.label('OpenCL Flags:') col.prop(cscene, "debug_opencl_kernel_type", text="Kernel") col.prop(cscene, "debug_opencl_device_type", text="Device") + col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program") 
col.prop(cscene, "debug_use_opencl_debug", text="Debug") @@ -1633,7 +1637,7 @@ def draw_device(self, context): split = layout.split(percentage=1/3) split.label("Device:") row = split.row() - row.active = show_device_selection(context) + row.active = show_device_active(context) row.prop(cscene, "device", text="") if engine.with_osl() and use_cpu(context): @@ -1712,17 +1716,75 @@ def get_panels(): return panels + +classes = ( + CYCLES_MT_sampling_presets, + CYCLES_MT_integrator_presets, + CyclesRender_PT_sampling, + CyclesRender_PT_geometry, + CyclesRender_PT_light_paths, + CyclesRender_PT_motion_blur, + CyclesRender_PT_film, + CyclesRender_PT_performance, + CyclesRender_PT_layer_options, + CyclesRender_PT_layer_passes, + CyclesRender_PT_views, + Cycles_PT_post_processing, + CyclesCamera_PT_dof, + Cycles_PT_context_material, + CyclesObject_PT_motion_blur, + CyclesObject_PT_cycles_settings, + CYCLES_OT_use_shading_nodes, + CyclesLamp_PT_preview, + CyclesLamp_PT_lamp, + CyclesLamp_PT_nodes, + CyclesLamp_PT_spot, + CyclesWorld_PT_preview, + CyclesWorld_PT_surface, + CyclesWorld_PT_volume, + CyclesWorld_PT_ambient_occlusion, + CyclesWorld_PT_mist, + CyclesWorld_PT_ray_visibility, + CyclesWorld_PT_settings, + CyclesMaterial_PT_preview, + CyclesMaterial_PT_surface, + CyclesMaterial_PT_volume, + CyclesMaterial_PT_displacement, + CyclesMaterial_PT_settings, + CyclesTexture_PT_context, + CyclesTexture_PT_node, + CyclesTexture_PT_mapping, + CyclesTexture_PT_colors, + CyclesParticle_PT_textures, + CyclesRender_PT_bake, + CyclesRender_PT_debug, + CyclesParticle_PT_CurveSettings, + CyclesScene_PT_simplify, +) + + def register(): + from bpy.utils import register_class + bpy.types.RENDER_PT_render.append(draw_device) bpy.types.VIEW3D_HT_header.append(draw_pause) for panel in get_panels(): panel.COMPAT_ENGINES.add('CYCLES') + for cls in classes: + register_class(cls) + + def unregister(): + from bpy.utils import unregister_class + bpy.types.RENDER_PT_render.remove(draw_device) 
bpy.types.VIEW3D_HT_header.remove(draw_pause) for panel in get_panels(): if 'CYCLES' in panel.COMPAT_ENGINES: panel.COMPAT_ENGINES.remove('CYCLES') + + for cls in classes: + unregister_class(cls) diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index f02fc553908..40d6b25f2b7 100644 --- a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "camera.h" -#include "scene.h" +#include "render/camera.h" +#include "render/scene.h" -#include "blender_sync.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" -#include "util_logging.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index e42ff5d72a6..6fa038e8bf0 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -14,18 +14,18 @@ * limitations under the License. 
*/ -#include "attribute.h" -#include "camera.h" -#include "curves.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" +#include "render/attribute.h" +#include "render/camera.h" +#include "render/curves.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" -#include "blender_sync.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -411,6 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, } } + mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -434,8 +435,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f) continue; - numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution; - numtris += (CData->curve_keynum[curve] - 2)*resolution; + numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution; + numtris += (CData->curve_keynum[curve] - 1)*2*resolution; } } @@ -545,6 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, } } + mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -890,7 +892,7 @@ void BlenderSync::sync_curves(Mesh *mesh, } /* obtain general settings */ - bool use_curves = scene->curve_system_manager->use_curves; + const bool use_curves = scene->curve_system_manager->use_curves; if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) { if(!motion) @@ -898,11 +900,11 @@ void BlenderSync::sync_curves(Mesh *mesh, return; } - int primitive = scene->curve_system_manager->primitive; - int triangle_method = 
scene->curve_system_manager->triangle_method; - int resolution = scene->curve_system_manager->resolution; - size_t vert_num = mesh->verts.size(); - size_t tri_num = mesh->num_triangles(); + const int primitive = scene->curve_system_manager->primitive; + const int triangle_method = scene->curve_system_manager->triangle_method; + const int resolution = scene->curve_system_manager->resolution; + const size_t vert_num = mesh->verts.size(); + const size_t tri_num = mesh->num_triangles(); int used_res = 1; /* extract particle hair data - should be combined with connecting to mesh later*/ diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp index f4f86929168..d0f82e37662 100644 --- a/intern/cycles/blender/blender_logging.cpp +++ b/intern/cycles/blender/blender_logging.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "CCL_api.h" -#include "util_logging.h" +#include "blender/CCL_api.h" +#include "util/util_logging.h" void CCL_init_logging(const char *argv0) { diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 85117cfff7b..e0e89cec65c 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -15,21 +15,22 @@ */ -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "camera.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/camera.h" -#include "blender_sync.h" -#include "blender_session.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_session.h" +#include "blender/blender_util.h" -#include "subd_patch.h" -#include "subd_split.h" +#include "subd/subd_patch.h" +#include "subd/subd_split.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_math.h" +#include "util/util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_math.h" #include "mikktspace.h" @@ 
-525,69 +526,177 @@ static void attr_create_uv_map(Scene *scene, } /* Create vertex pointiness attributes. */ + +/* Compare vertices by sum of their coordinates. */ +class VertexAverageComparator { +public: + VertexAverageComparator(const array<float3>& verts) + : verts_(verts) { + } + + bool operator()(const int& vert_idx_a, const int& vert_idx_b) + { + const float3 &vert_a = verts_[vert_idx_a]; + const float3 &vert_b = verts_[vert_idx_b]; + if(vert_a == vert_b) { + /* Special case for doubles, so we ensure ordering. */ + return vert_idx_a > vert_idx_b; + } + const float x1 = vert_a.x + vert_a.y + vert_a.z; + const float x2 = vert_b.x + vert_b.y + vert_b.z; + return x1 < x2; + } + +protected: + const array<float3>& verts_; +}; + static void attr_create_pointiness(Scene *scene, Mesh *mesh, BL::Mesh& b_mesh, bool subdivision) { - if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) { - const int numverts = b_mesh.vertices.length(); - AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes; - Attribute *attr = attributes.add(ATTR_STD_POINTINESS); - float *data = attr->data_float(); - int *counter = new int[numverts]; - float *raw_data = new float[numverts]; - float3 *edge_accum = new float3[numverts]; - - /* Calculate pointiness using single ring neighborhood. 
*/ - memset(counter, 0, sizeof(int) * numverts); - memset(raw_data, 0, sizeof(float) * numverts); - memset(edge_accum, 0, sizeof(float3) * numverts); - BL::Mesh::edges_iterator e; - int i = 0; - for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) { - int v0 = b_mesh.edges[i].vertices()[0], - v1 = b_mesh.edges[i].vertices()[1]; - float3 co0 = get_float3(b_mesh.vertices[v0].co()), - co1 = get_float3(b_mesh.vertices[v1].co()); - float3 edge = normalize(co1 - co0); - edge_accum[v0] += edge; - edge_accum[v1] += -edge; - ++counter[v0]; - ++counter[v1]; - } - i = 0; - BL::Mesh::vertices_iterator v; - for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i) { - if(counter[i] > 0) { - float3 normal = get_float3(b_mesh.vertices[i].normal()); - float angle = safe_acosf(dot(normal, edge_accum[i] / counter[i])); - raw_data[i] = angle * M_1_PI_F; + if(!mesh->need_attribute(scene, ATTR_STD_POINTINESS)) { + return; + } + const int num_verts = b_mesh.vertices.length(); + /* STEP 1: Find out duplicated vertices and point duplicates to a single + * original vertex. + */ + vector<int> sorted_vert_indeices(num_verts); + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + sorted_vert_indeices[vert_index] = vert_index; + } + VertexAverageComparator compare(mesh->verts); + sort(sorted_vert_indeices.begin(), sorted_vert_indeices.end(), compare); + /* This array stores index of the original vertex for the given vertex + * index. 
+ */ + vector<int> vert_orig_index(num_verts); + for(int sorted_vert_index = 0; + sorted_vert_index < num_verts; + ++sorted_vert_index) + { + const int vert_index = sorted_vert_indeices[sorted_vert_index]; + const float3 &vert_co = mesh->verts[vert_index]; + bool found = false; + for(int other_sorted_vert_index = sorted_vert_index + 1; + other_sorted_vert_index < num_verts; + ++other_sorted_vert_index) + { + const int other_vert_index = + sorted_vert_indeices[other_sorted_vert_index]; + const float3 &other_vert_co = mesh->verts[other_vert_index]; + /* We are too far away now, we wouldn't have duplicate. */ + if((other_vert_co.x + other_vert_co.y + other_vert_co.z) - + (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON) + { + break; } - else { - raw_data[i] = 0.0f; + /* Found duplicate. */ + if(len_squared(other_vert_co - vert_co) < FLT_EPSILON) { + found = true; + vert_orig_index[vert_index] = other_vert_index; + break; } } - - /* Blur vertices to approximate 2 ring neighborhood. */ - memset(counter, 0, sizeof(int) * numverts); - memcpy(data, raw_data, sizeof(float) * numverts); - i = 0; - for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) { - int v0 = b_mesh.edges[i].vertices()[0], - v1 = b_mesh.edges[i].vertices()[1]; - data[v0] += raw_data[v1]; - data[v1] += raw_data[v0]; - ++counter[v0]; - ++counter[v1]; + if(!found) { + vert_orig_index[vert_index] = vert_index; + } + } + /* Make sure we always points to the very first orig vertex. */ + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + int orig_index = vert_orig_index[vert_index]; + while(orig_index != vert_orig_index[orig_index]) { + orig_index = vert_orig_index[orig_index]; } - for(i = 0; i < numverts; ++i) { - data[i] /= counter[i] + 1; + vert_orig_index[vert_index] = orig_index; + } + sorted_vert_indeices.free_memory(); + /* STEP 2: Calculate vertex normals taking into account their possible + * duplicates which gets "welded" together. 
+ */ + vector<float3> vert_normal(num_verts, make_float3(0.0f, 0.0f, 0.0f)); + /* First we accumulate all vertex normals in the original index. */ + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + const float3 normal = get_float3(b_mesh.vertices[vert_index].normal()); + const int orig_index = vert_orig_index[vert_index]; + vert_normal[orig_index] += normal; + } + /* Then we normalize the accumulated result and flush it to all duplicates + * as well. + */ + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + const int orig_index = vert_orig_index[vert_index]; + vert_normal[vert_index] = normalize(vert_normal[orig_index]); + } + /* STEP 3: Calculate pointiness using single ring neighborhood. */ + vector<int> counter(num_verts, 0); + vector<float> raw_data(num_verts, 0.0f); + vector<float3> edge_accum(num_verts, make_float3(0.0f, 0.0f, 0.0f)); + BL::Mesh::edges_iterator e; + EdgeMap visited_edges; + int edge_index = 0; + memset(&counter[0], 0, sizeof(int) * counter.size()); + for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) { + const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]], + v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]]; + if(visited_edges.exists(v0, v1)) { + continue; + } + visited_edges.insert(v0, v1); + float3 co0 = get_float3(b_mesh.vertices[v0].co()), + co1 = get_float3(b_mesh.vertices[v1].co()); + float3 edge = normalize(co1 - co0); + edge_accum[v0] += edge; + edge_accum[v1] += -edge; + ++counter[v0]; + ++counter[v1]; + } + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + const int orig_index = vert_orig_index[vert_index]; + if(orig_index != vert_index) { + /* Skip duplicates, they'll be overwritten later on. 
*/ + continue; + } + if(counter[vert_index] > 0) { + const float3 normal = vert_normal[vert_index]; + const float angle = + safe_acosf(dot(normal, + edge_accum[vert_index] / counter[vert_index])); + raw_data[vert_index] = angle * M_1_PI_F; + } + else { + raw_data[vert_index] = 0.0f; } - - delete [] counter; - delete [] raw_data; - delete [] edge_accum; + } + /* STEP 3: Blur vertices to approximate 2 ring neighborhood. */ + AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes; + Attribute *attr = attributes.add(ATTR_STD_POINTINESS); + float *data = attr->data_float(); + memcpy(data, &raw_data[0], sizeof(float) * raw_data.size()); + memset(&counter[0], 0, sizeof(int) * counter.size()); + edge_index = 0; + visited_edges.clear(); + for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) { + const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]], + v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]]; + if(visited_edges.exists(v0, v1)) { + continue; + } + visited_edges.insert(v0, v1); + data[v0] += raw_data[v1]; + data[v1] += raw_data[v0]; + ++counter[v0]; + ++counter[v1]; + } + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + data[vert_index] /= counter[vert_index] + 1; + } + /* STEP 4: Copy attribute to the duplicated vertices. */ + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { + const int orig_index = vert_orig_index[vert_index]; + data[vert_index] = data[orig_index]; } } @@ -656,9 +765,6 @@ static void create_mesh(Scene *scene, generated[i++] = get_float3(v->undeformed_co())*size - loc; } - /* Create needed vertex attributes. 
*/ - attr_create_pointiness(scene, mesh, b_mesh, subdivision); - /* create faces */ vector<int> nverts(numfaces); vector<int> face_flags(numfaces, FACE_FLAG_NONE); @@ -671,6 +777,15 @@ static void create_mesh(Scene *scene, int shader = clamp(f->material_index(), 0, used_shaders.size()-1); bool smooth = f->use_smooth() || use_loop_normals; + if(use_loop_normals) { + BL::Array<float, 12> loop_normals = f->split_normals(); + for(int i = 0; i < n; i++) { + N[vi[i]] = make_float3(loop_normals[i * 3], + loop_normals[i * 3 + 1], + loop_normals[i * 3 + 2]); + } + } + /* Create triangles. * * NOTE: Autosmooth is already taken care about. @@ -704,7 +819,7 @@ static void create_mesh(Scene *scene, int shader = clamp(p->material_index(), 0, used_shaders.size()-1); bool smooth = p->use_smooth() || use_loop_normals; - vi.reserve(n); + vi.resize(n); for(int i = 0; i < n; i++) { /* NOTE: Autosmooth is already taken care about. */ vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index(); @@ -718,6 +833,7 @@ static void create_mesh(Scene *scene, /* Create all needed attributes. * The calculate functions will check whether they're needed or not. */ + attr_create_pointiness(scene, mesh, b_mesh, subdivision); attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision); attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs); @@ -1178,4 +1294,3 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, } CCL_NAMESPACE_END - diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index 637cf7abda8..d05699236cc 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -14,24 +14,24 @@ * limitations under the License. 
*/ -#include "camera.h" -#include "integrator.h" -#include "graph.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "nodes.h" -#include "particles.h" -#include "shader.h" - -#include "blender_object_cull.h" -#include "blender_sync.h" -#include "blender_util.h" - -#include "util_foreach.h" -#include "util_hash.h" -#include "util_logging.h" +#include "render/camera.h" +#include "render/integrator.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/nodes.h" +#include "render/particles.h" +#include "render/shader.h" + +#include "blender/blender_object_cull.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" + +#include "util/util_foreach.h" +#include "util/util_hash.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -343,6 +343,13 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, object_updated = true; } + PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles"); + bool is_shadow_catcher = get_boolean(cobject, "is_shadow_catcher"); + if(is_shadow_catcher != object->is_shadow_catcher) { + object->is_shadow_catcher = is_shadow_catcher; + object_updated = true; + } + /* object sync * transform comparison should not be needed, but duplis don't work perfect * in the depsgraph and may not signal changes, so this is a workaround */ diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp index 08918dd1a49..0333c027f70 100644 --- a/intern/cycles/blender/blender_object_cull.cpp +++ b/intern/cycles/blender/blender_object_cull.cpp @@ -16,9 +16,9 @@ #include <cstdlib> -#include "camera.h" +#include "render/camera.h" -#include "blender_object_cull.h" +#include "blender/blender_object_cull.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h index b6f0ca5cd31..2147877a860 100644 
--- a/intern/cycles/blender/blender_object_cull.h +++ b/intern/cycles/blender/blender_object_cull.h @@ -17,8 +17,8 @@ #ifndef __BLENDER_OBJECT_CULL_H__ #define __BLENDER_OBJECT_CULL_H__ -#include "blender_sync.h" -#include "util_types.h" +#include "blender/blender_sync.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp index dd2900a8d5b..00f8cb3cf1b 100644 --- a/intern/cycles/blender/blender_particles.cpp +++ b/intern/cycles/blender/blender_particles.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ -#include "mesh.h" -#include "object.h" -#include "particles.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/particles.h" -#include "blender_sync.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" -#include "util_foreach.h" +#include "util/util_foreach.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 438abc49f88..d509e9de981 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -16,21 +16,21 @@ #include <Python.h> -#include "CCL_api.h" +#include "blender/CCL_api.h" -#include "blender_sync.h" -#include "blender_session.h" +#include "blender/blender_sync.h" +#include "blender/blender_session.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_md5.h" -#include "util_opengl.h" -#include "util_path.h" -#include "util_string.h" -#include "util_types.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_opengl.h" +#include "util/util_path.h" +#include "util/util_string.h" +#include "util/util_types.h" #ifdef WITH_OSL -#include "osl.h" +#include "render/osl.h" #include <OSL/oslquery.h> #include <OSL/oslconfig.h> @@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) 
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh"); + flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); + flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OpenCL kernel type. */ switch(get_enum(cscene, "debug_opencl_kernel_type")) { case 0: @@ -104,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) } /* Synchronize other OpenCL flags. */ flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); + flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program"); return flags.opencl.device_type != opencl_device_type || flags.opencl.kernel_type != opencl_kernel_type; } @@ -641,7 +644,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/ Py_RETURN_NONE; } -static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args) +static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args) { int num_resumable_chunks, current_resumable_chunk; if(!PyArg_ParseTuple(args, "ii", @@ -676,6 +679,53 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args) Py_RETURN_NONE; } +static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args) +{ + int num_chunks, start_chunk, end_chunk; + if(!PyArg_ParseTuple(args, "iii", + &num_chunks, + &start_chunk, + &end_chunk)) { + Py_RETURN_NONE; + } + + if(num_chunks <= 0) { + fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n"); + abort(); + Py_RETURN_NONE; + } + if(start_chunk < 1 || start_chunk > num_chunks) { + fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); + abort(); + Py_RETURN_NONE; + } + if(end_chunk < 1 || end_chunk > num_chunks) { + 
fprintf(stderr, "Cycles: Bad value for end chunk number.\n"); + abort(); + Py_RETURN_NONE; + } + if(start_chunk > end_chunk) { + fprintf(stderr, "Cycles: End chunk should be higher than start one.\n"); + abort(); + Py_RETURN_NONE; + } + + VLOG(1) << "Initialized resumable render: " + << "num_resumable_chunks=" << num_chunks << ", " + << "start_resumable_chunk=" << start_chunk << ", " + << "end_resumable_chunk=" << end_chunk; + BlenderSession::num_resumable_chunks = num_chunks; + BlenderSession::start_resumable_chunk = start_chunk; + BlenderSession::end_resumable_chunk = end_chunk; + + printf("Cycles: Will render chunks %d to %d of %d\n", + start_chunk, + end_chunk, + num_chunks); + + Py_RETURN_NONE; +} + static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceInfo>& devices = Device::available_devices(); @@ -715,7 +765,8 @@ static PyMethodDef methods[] = { {"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""}, /* Resumable render */ - {"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""}, + {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""}, + {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""}, /* Compute Device selection */ {"get_device_types", get_device_types_func, METH_VARARGS, ""}, diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 2f30cbd961f..26f9bccd95d 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -16,36 +16,38 @@ #include <stdlib.h> -#include "background.h" -#include "buffers.h" -#include "camera.h" -#include "device.h" -#include "integrator.h" -#include "film.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "session.h" -#include "shader.h" - -#include "util_color.h" -#include "util_foreach.h" -#include "util_function.h" -#include "util_hash.h" -#include "util_logging.h" -#include
"util_progress.h" -#include "util_time.h" - -#include "blender_sync.h" -#include "blender_session.h" -#include "blender_util.h" +#include "render/background.h" +#include "render/buffers.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/film.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/shader.h" + +#include "util/util_color.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_hash.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_time.h" + +#include "blender/blender_sync.h" +#include "blender/blender_session.h" +#include "blender/blender_util.h" CCL_NAMESPACE_BEGIN bool BlenderSession::headless = false; int BlenderSession::num_resumable_chunks = 0; int BlenderSession::current_resumable_chunk = 0; +int BlenderSession::start_resumable_chunk = 0; +int BlenderSession::end_resumable_chunk = 0; BlenderSession::BlenderSession(BL::RenderEngine& b_engine, BL::UserPreferences& b_userpref, @@ -68,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine, background = true; last_redraw_time = 0.0; start_resize_time = 0.0; + last_status_time = 0.0; } BlenderSession::BlenderSession(BL::RenderEngine& b_engine, @@ -93,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine, background = false; last_redraw_time = 0.0; start_resize_time = 0.0; + last_status_time = 0.0; } BlenderSession::~BlenderSession() @@ -989,10 +993,14 @@ void BlenderSession::update_status_progress() if(substatus.size() > 0) status += " | " + substatus; - if(status != last_status) { + double current_time = time_dt(); + /* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date. 
+ * For headless rendering, only report when something significant changes to keep the console output readable. */ + if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) { b_engine.update_stats("", (timestatus + scene + status).c_str()); b_engine.update_memory_stats(mem_used, mem_peak); last_status = status; + last_status_time = current_time; } if(progress != last_progress) { b_engine.update_progress(progress); @@ -1342,9 +1350,21 @@ void BlenderSession::update_resumable_tile_manager(int num_samples) return; } - int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks); - int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1); - int range_num_samples = num_samples_per_chunk; + const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks); + + int range_start_sample, range_num_samples; + if(current_resumable_chunk != 0) { + /* Single chunk rendering. */ + range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1); + range_num_samples = num_samples_per_chunk; + } + else { + /* Ranged-chunks. */ + const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1; + range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1); + range_num_samples = num_chunks * num_samples_per_chunk; + } + /* Make sure we don't overshoot. 
*/ if(range_start_sample + range_num_samples > num_samples) { range_num_samples = num_samples - range_start_sample; } diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 82fe218b4ce..22b21a18f2e 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -17,12 +17,12 @@ #ifndef __BLENDER_SESSION_H__ #define __BLENDER_SESSION_H__ -#include "device.h" -#include "scene.h" -#include "session.h" -#include "bake.h" +#include "device/device.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/bake.h" -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -113,6 +113,7 @@ public: string last_status; string last_error; float last_progress; + double last_status_time; int width, height; double start_resize_time; @@ -137,6 +138,10 @@ public: /* Current resumable chunk index to render. */ static int current_resumable_chunk; + /* Alternative to single-chunk rendering to render a range of chunks. */ + static int start_resumable_chunk; + static int end_resumable_chunk; + protected: void do_write_update_render_result(BL::RenderResult& b_rr, BL::RenderLayer& b_rlay, diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index c9d4236a7f2..1ec82445b20 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -14,20 +14,21 @@ * limitations under the License.
*/ -#include "background.h" -#include "graph.h" -#include "light.h" -#include "nodes.h" -#include "osl.h" -#include "scene.h" -#include "shader.h" - -#include "blender_texture.h" -#include "blender_sync.h" -#include "blender_util.h" - -#include "util_debug.h" -#include "util_string.h" +#include "render/background.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/nodes.h" +#include "render/osl.h" +#include "render/scene.h" +#include "render/shader.h" + +#include "blender/blender_texture.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" + +#include "util/util_debug.h" +#include "util/util_string.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -622,7 +623,8 @@ static ShaderNode *add_node(Scene *scene, bool is_builtin = b_image.packed_file() || b_image.source() == BL::Image::source_GENERATED || b_image.source() == BL::Image::source_MOVIE || - b_engine.is_preview(); + (b_engine.is_preview() && + b_image.source() != BL::Image::source_SEQUENCE); if(is_builtin) { /* for builtin images we're using image datablock name to find an image to @@ -675,7 +677,8 @@ static ShaderNode *add_node(Scene *scene, bool is_builtin = b_image.packed_file() || b_image.source() == BL::Image::source_GENERATED || b_image.source() == BL::Image::source_MOVIE || - b_engine.is_preview(); + (b_engine.is_preview() && + b_image.source() != BL::Image::source_SEQUENCE); if(is_builtin) { int scene_frame = b_scene.frame_current(); @@ -1168,6 +1171,13 @@ static void add_nodes(Scene *scene, /* Sync Materials */ +void BlenderSync::sync_materials_simpligy(Shader *shader) +{ + ShaderGraph *graph = shader->graph; + graph->simplify(scene); + shader->tag_update(scene); +} + void BlenderSync::sync_materials(bool update_all) { shader_map.set_default(scene->default_surface); @@ -1175,6 +1185,8 @@ void BlenderSync::sync_materials(bool update_all) /* material loop */ BL::BlendData::materials_iterator b_mat; + TaskPool pool; + for(b_data.materials.begin(b_mat); b_mat 
!= b_data.materials.end(); ++b_mat) { Shader *shader; @@ -1210,9 +1222,31 @@ void BlenderSync::sync_materials(bool update_all) shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP; shader->set_graph(graph); - shader->tag_update(scene); + + /* By simplifying the shader graph as soon as possible, some + * redundant shader nodes might be removed which prevents loading + * unnecessary attributes later. + * + * However, since graph simplification also accounts for e.g. mix + * weight, this would cause frequent expensive resyncs in interactive + * sessions, so for those sessions optimization is only performed + * right before compiling. + */ + if(!preview) { + pool.push(function_bind(&BlenderSync::sync_materials_simpligy, + this, + shader)); + } + else { + /* NOTE: Update tagging can access links which are being + * optimized out. + */ + shader->tag_update(scene); + } } } + + pool.wait_work(); } /* Sync World */ diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index d8043105cd8..3b071bf0e7d 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -14,29 +14,29 @@ * limitations under the License. 
*/ -#include "background.h" -#include "camera.h" -#include "film.h" -#include "graph.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "scene.h" -#include "shader.h" -#include "curves.h" - -#include "device.h" - -#include "blender_sync.h" -#include "blender_session.h" -#include "blender_util.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_opengl.h" -#include "util_hash.h" +#include "render/background.h" +#include "render/camera.h" +#include "render/film.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/curves.h" + +#include "device/device.h" + +#include "blender/blender_sync.h" +#include "blender/blender_session.h" +#include "blender/blender_util.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_opengl.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 6984cbda259..62e2f8f563a 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -22,15 +22,15 @@ #include "RNA_access.h" #include "RNA_blender_cpp.h" -#include "blender_util.h" +#include "blender/blender_util.h" -#include "scene.h" -#include "session.h" +#include "render/scene.h" +#include "render/session.h" -#include "util_map.h" -#include "util_set.h" -#include "util_transform.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_set.h" +#include "util/util_transform.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -96,6 +96,7 @@ public: private: /* sync */ void sync_lamps(bool update_all); + void sync_materials_simpligy(Shader *shader); void sync_materials(bool update_all); void sync_objects(BL::SpaceView3D& b_v3d, float 
motion_time = 0.0f); void sync_motion(BL::RenderSettings& b_render, diff --git a/intern/cycles/blender/blender_texture.cpp b/intern/cycles/blender/blender_texture.cpp index 3807e683c7c..b2e27b76189 100644 --- a/intern/cycles/blender/blender_texture.cpp +++ b/intern/cycles/blender/blender_texture.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "blender_texture.h" +#include "blender/blender_texture.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h index ad96f9db8ed..734231a85ec 100644 --- a/intern/cycles/blender/blender_texture.h +++ b/intern/cycles/blender/blender_texture.h @@ -18,7 +18,7 @@ #define __BLENDER_TEXTURE_H__ #include <stdlib.h> -#include "blender_sync.h" +#include "blender/blender_sync.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index b67834cdea3..4d575330520 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -17,14 +17,15 @@ #ifndef __BLENDER_UTIL_H__ #define __BLENDER_UTIL_H__ -#include "mesh.h" +#include "render/mesh.h" -#include "util_map.h" -#include "util_path.h" -#include "util_set.h" -#include "util_transform.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_algorithm.h" +#include "util/util_map.h" +#include "util/util_path.h" +#include "util/util_set.h" +#include "util/util_transform.h" +#include "util/util_types.h" +#include "util/util_vector.h" /* Hacks to hook into Blender API * todo: clean this up ... 
*/ @@ -78,7 +79,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data, me.calc_normals_split(); } else { - me.split_faces(); + me.split_faces(false); } } if(subdivision_type == Mesh::SUBDIVISION_NONE) { @@ -786,6 +787,35 @@ struct ParticleSystemKey { } }; +class EdgeMap { +public: + EdgeMap() { + } + + void clear() { + edges_.clear(); + } + + void insert(int v0, int v1) { + get_sorted_verts(v0, v1); + edges_.insert(std::pair<int, int>(v0, v1)); + } + + bool exists(int v0, int v1) { + get_sorted_verts(v0, v1); + return edges_.find(std::pair<int, int>(v0, v1)) != edges_.end(); + } + +protected: + void get_sorted_verts(int& v0, int& v1) { + if(v0 > v1) { + swap(v0, v1); + } + } + + set< std::pair<int, int> > edges_; +}; + CCL_NAMESPACE_END #endif /* __BLENDER_UTIL_H__ */ diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 92e48f0d87f..4701d75350a 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - ../graph - ../kernel - ../kernel/svm - ../render - ../util - ../device + .. ) set(INC_SYS diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 874a4246d1d..58348d16746 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -15,25 +15,25 @@ * limitations under the License. 
*/ -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "curves.h" - -#include "bvh.h" -#include "bvh_build.h" -#include "bvh_node.h" -#include "bvh_params.h" -#include "bvh_unaligned.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_system.h" -#include "util_types.h" -#include "util_math.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/curves.h" + +#include "bvh/bvh.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_node.h" +#include "bvh/bvh_params.h" +#include "bvh/bvh_unaligned.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_types.h" +#include "util/util_math.h" CCL_NAMESPACE_BEGIN @@ -67,7 +67,7 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) if(params.use_qbvh) return new QBVH(params, objects); else - return new RegularBVH(params, objects); + return new BinaryBVH(params, objects); } /* Building */ @@ -81,6 +81,7 @@ void BVH::build(Progress& progress) pack.prim_type, pack.prim_index, pack.prim_object, + pack.prim_time, params, progress); BVHNode *root = bvh_build.run(); @@ -256,6 +257,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) pack.leaf_nodes.resize(leaf_nodes_size); pack.object_node.resize(objects.size()); + if(params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) { + pack.prim_time.resize(prim_index_size); + } + int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL; int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL; int *pack_prim_object = (pack.prim_object.size())? 
&pack.prim_object[0]: NULL; @@ -264,6 +269,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL; int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL; int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL; + float2 *pack_prim_time = (pack.prim_time.size())? &pack.prim_time[0]: NULL; /* merge */ foreach(Object *ob, objects) { @@ -309,6 +315,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) int *bvh_prim_type = &bvh->pack.prim_type[0]; uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0]; uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0]; + float2 *bvh_prim_time = bvh->pack.prim_time.size()? &bvh->pack.prim_time[0]: NULL; for(size_t i = 0; i < bvh_prim_index_size; i++) { if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { @@ -324,6 +331,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i]; pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i]; pack_prim_object[pack_prim_index_offset] = 0; // unused for instances + if(bvh_prim_time != NULL) { + pack_prim_time[pack_prim_index_offset] = bvh_prim_time[i]; + } pack_prim_index_offset++; } } @@ -414,64 +424,64 @@ static bool node_bvh_is_unaligned(const BVHNode *node) { const BVHNode *node0 = node->get_child(0), *node1 = node->get_child(1); - return node0->is_unaligned() || node1->is_unaligned(); + return node0->is_unaligned || node1->is_unaligned; } -RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_) +BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_) : BVH(params_, objects_) { } -void RegularBVH::pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf) +void BinaryBVH::pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf) { assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); float4 
data[BVH_NODE_LEAF_SIZE]; memset(data, 0, sizeof(data)); - if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) { + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { /* object */ - data[0].x = __int_as_float(~(leaf->m_lo)); + data[0].x = __int_as_float(~(leaf->lo)); data[0].y = __int_as_float(0); } else { /* triangle */ - data[0].x = __int_as_float(leaf->m_lo); - data[0].y = __int_as_float(leaf->m_hi); + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); } - data[0].z = __uint_as_float(leaf->m_visibility); + data[0].z = __uint_as_float(leaf->visibility); if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); } memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); } -void RegularBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) +void BinaryBVH::pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { - if(e0.node->is_unaligned() || e1.node->is_unaligned()) { + if(e0.node->is_unaligned || e1.node->is_unaligned) { pack_unaligned_inner(e, e0, e1); } else { pack_aligned_inner(e, e0, e1); } } -void RegularBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) +void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { pack_aligned_node(e.idx, - e0.node->m_bounds, e1.node->m_bounds, + e0.node->bounds, e1.node->bounds, e0.encodeIdx(), e1.encodeIdx(), - e0.node->m_visibility, e1.node->m_visibility); + e0.node->visibility, e1.node->visibility); } -void RegularBVH::pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1) +void BinaryBVH::pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint 
visibility1) { assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); assert(c0 < 0 || c0 < pack.nodes.size()); @@ -498,26 +508,26 @@ void RegularBVH::pack_aligned_node(int idx, memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); } -void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) +void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) { pack_unaligned_node(e.idx, e0.node->get_aligned_space(), e1.node->get_aligned_space(), - e0.node->m_bounds, - e1.node->m_bounds, + e0.node->bounds, + e1.node->bounds, e0.encodeIdx(), e1.encodeIdx(), - e0.node->m_visibility, e1.node->m_visibility); + e0.node->visibility, e1.node->visibility); } -void RegularBVH::pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& bounds0, - const BoundBox& bounds1, - int c0, int c1, - uint visibility0, uint visibility1) +void BinaryBVH::pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& bounds0, + const BoundBox& bounds1, + int c0, int c1, + uint visibility0, uint visibility1) { assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); assert(c0 < 0 || c0 < pack.nodes.size()); @@ -543,7 +553,7 @@ void RegularBVH::pack_unaligned_node(int idx, memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); } -void RegularBVH::pack_nodes(const BVHNode *root) +void BinaryBVH::pack_nodes(const BVHNode *root) { const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); @@ -620,7 +630,7 @@ void RegularBVH::pack_nodes(const BVHNode *root) pack.root_index = (root->is_leaf())? 
-1: 0; } -void RegularBVH::refit_nodes() +void BinaryBVH::refit_nodes() { assert(!params.top_level); @@ -629,7 +639,7 @@ void RegularBVH::refit_nodes() refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); } -void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) { if(leaf) { assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); @@ -759,18 +769,18 @@ static bool node_qbvh_is_unaligned(const BVHNode *node) *node1 = node->get_child(1); bool has_unaligned = false; if(node0->is_leaf()) { - has_unaligned |= node0->is_unaligned(); + has_unaligned |= node0->is_unaligned; } else { - has_unaligned |= node0->get_child(0)->is_unaligned(); - has_unaligned |= node0->get_child(1)->is_unaligned(); + has_unaligned |= node0->get_child(0)->is_unaligned; + has_unaligned |= node0->get_child(1)->is_unaligned; } if(node1->is_leaf()) { - has_unaligned |= node1->is_unaligned(); + has_unaligned |= node1->is_unaligned; } else { - has_unaligned |= node1->get_child(0)->is_unaligned(); - has_unaligned |= node1->get_child(1)->is_unaligned(); + has_unaligned |= node1->get_child(0)->is_unaligned; + has_unaligned |= node1->get_child(1)->is_unaligned; } return has_unaligned; } @@ -785,19 +795,19 @@ void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) { float4 data[BVH_QNODE_LEAF_SIZE]; memset(data, 0, sizeof(data)); - if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) { + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { /* object */ - data[0].x = __int_as_float(~(leaf->m_lo)); + data[0].x = __int_as_float(~(leaf->lo)); data[0].y = __int_as_float(0); } else { /* triangle */ - data[0].x = __int_as_float(leaf->m_lo); - data[0].y = __int_as_float(leaf->m_hi); + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); } - data[0].z = __uint_as_float(leaf->m_visibility); + data[0].z = 
__uint_as_float(leaf->visibility); if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); } memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); @@ -813,7 +823,7 @@ void QBVH::pack_inner(const BVHStackEntry& e, */ if(params.use_unaligned_nodes) { for(int i = 0; i < num; i++) { - if(en[i].node->is_unaligned()) { + if(en[i].node->is_unaligned) { has_unaligned = true; break; } @@ -838,15 +848,15 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e, BoundBox bounds[4]; int child[4]; for(int i = 0; i < num; ++i) { - bounds[i] = en[i].node->m_bounds; + bounds[i] = en[i].node->bounds; child[i] = en[i].encodeIdx(); } pack_aligned_node(e.idx, bounds, child, - e.node->m_visibility, - e.node->m_time_from, - e.node->m_time_to, + e.node->visibility, + e.node->time_from, + e.node->time_to, num); } @@ -907,16 +917,16 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e, int child[4]; for(int i = 0; i < num; ++i) { aligned_space[i] = en[i].node->get_aligned_space(); - bounds[i] = en[i].node->m_bounds; + bounds[i] = en[i].node->bounds; child[i] = en[i].encodeIdx(); } pack_unaligned_node(e.idx, aligned_space, bounds, child, - e.node->m_visibility, - e.node->m_time_from, - e.node->m_time_to, + e.node->visibility, + e.node->time_from, + e.node->time_to, num); } diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 35f4d305883..60bc62ee6e4 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -18,10 +18,10 @@ #ifndef __BVH_H__ #define __BVH_H__ -#include "bvh_params.h" +#include "bvh/bvh_params.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -68,6 +68,8 @@ struct PackedBVH { array<int> prim_index; /* mapping from BVH primitive index, to the object id of that primitive. */ array<int> prim_object; + /* Time range of BVH primitive. 
*/ + array<float2> prim_time; /* index of the root node. */ int root_index; @@ -108,15 +110,15 @@ protected: virtual void refit_nodes() = 0; }; -/* Regular BVH +/* Binary BVH * * Typical BVH with each node having two children. */ -class RegularBVH : public BVH { +class BinaryBVH : public BVH { protected: /* constructor */ friend class BVH; - RegularBVH(const BVHParams& params, const vector<Object*>& objects); + BinaryBVH(const BVHParams& params, const vector<Object*>& objects); /* pack */ void pack_nodes(const BVHNode *root); diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp index 5ddd7349f7b..3226008f511 100644 --- a/intern/cycles/bvh/bvh_binning.cpp +++ b/intern/cycles/bvh/bvh_binning.cpp @@ -19,11 +19,11 @@ #include <stdlib.h> -#include "bvh_binning.h" +#include "bvh/bvh_binning.h" -#include "util_algorithm.h" -#include "util_boundbox.h" -#include "util_types.h" +#include "util/util_algorithm.h" +#include "util/util_boundbox.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h index 52955f70151..285f9c56a62 100644 --- a/intern/cycles/bvh/bvh_binning.h +++ b/intern/cycles/bvh/bvh_binning.h @@ -18,10 +18,10 @@ #ifndef __BVH_BINNING_H__ #define __BVH_BINNING_H__ -#include "bvh_params.h" -#include "bvh_unaligned.h" +#include "bvh/bvh_params.h" +#include "bvh/bvh_unaligned.h" -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index a2f8b33cb0b..95c71b54da0 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -15,26 +15,26 @@ * limitations under the License. 
*/ -#include "bvh_binning.h" -#include "bvh_build.h" -#include "bvh_node.h" -#include "bvh_params.h" +#include "bvh/bvh_binning.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_node.h" +#include "bvh/bvh_params.h" #include "bvh_split.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "curves.h" - -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_progress.h" -#include "util_stack_allocator.h" -#include "util_simd.h" -#include "util_time.h" -#include "util_queue.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/curves.h" + +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_stack_allocator.h" +#include "util/util_simd.h" +#include "util/util_time.h" +#include "util/util_queue.h" CCL_NAMESPACE_BEGIN @@ -93,12 +93,14 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_, array<int>& prim_type_, array<int>& prim_index_, array<int>& prim_object_, + array<float2>& prim_time_, const BVHParams& params_, Progress& progress_) : objects(objects_), prim_type(prim_type_), prim_index(prim_index_), prim_object(prim_object_), + prim_time(prim_time_), params(params_), progress(progress_), progress_start_time(0.0), @@ -465,6 +467,9 @@ BVHNode* BVHBuild::run() } spatial_free_index = 0; + need_prim_time = params.num_motion_curve_steps > 0 || + params.num_motion_triangle_steps > 0; + /* init progress updates */ double build_start_time; build_start_time = progress_start_time = time_dt(); @@ -475,6 +480,12 @@ BVHNode* BVHBuild::run() prim_type.resize(references.size()); prim_index.resize(references.size()); prim_object.resize(references.size()); + if(need_prim_time) { + prim_time.resize(references.size()); + } + else { + prim_time.resize(0); + } /* build recursively */ BVHNode *rootnode; @@ -849,11 +860,14 @@ 
BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, prim_type[start] = ref->prim_type(); prim_index[start] = ref->prim_index(); prim_object[start] = ref->prim_object(); + if(need_prim_time) { + prim_time[start] = make_float2(ref->time_from(), ref->time_to()); + } uint visibility = objects[ref->prim_object()]->visibility; BVHNode *leaf_node = new LeafNode(ref->bounds(), visibility, start, start+1); - leaf_node->m_time_from = ref->time_from(); - leaf_node->m_time_to = ref->time_to(); + leaf_node->time_from = ref->time_from(); + leaf_node->time_to = ref->time_to(); return leaf_node; } else { @@ -862,12 +876,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid); BoundBox bounds = BoundBox::empty; - bounds.grow(leaf0->m_bounds); - bounds.grow(leaf1->m_bounds); + bounds.grow(leaf0->bounds); + bounds.grow(leaf1->bounds); BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1); - inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from); - inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to); + inner_node->time_from = min(leaf0->time_from, leaf1->time_from); + inner_node->time_to = max(leaf0->time_to, leaf1->time_to); return inner_node; } } @@ -891,11 +905,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, * can not control. */ typedef StackAllocator<256, int> LeafStackAllocator; + typedef StackAllocator<256, float2> LeafTimeStackAllocator; typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; + vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL]; vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; /* TODO(sergey): In theory we should be able to store references. 
*/ @@ -918,6 +934,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, p_type[type_index].push_back(ref.prim_type()); p_index[type_index].push_back(ref.prim_index()); p_object[type_index].push_back(ref.prim_object()); + p_time[type_index].push_back(make_float2(ref.time_from(), + ref.time_to())); bounds[type_index].grow(ref.bounds()); visibility[type_index] |= objects[ref.prim_object()]->visibility; @@ -947,9 +965,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object; + vector<float2, LeafTimeStackAllocator> local_prim_time; local_prim_type.resize(num_new_prims); local_prim_index.resize(num_new_prims); local_prim_object.resize(num_new_prims); + if(need_prim_time) { + local_prim_time.resize(num_new_prims); + } for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) { int num = (int)p_type[i].size(); if(num != 0) { @@ -962,6 +984,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, local_prim_type[index] = p_type[i][j]; local_prim_index[index] = p_index[i][j]; local_prim_object[index] = p_object[i][j]; + if(need_prim_time) { + local_prim_time[index] = p_time[i][j]; + } if(params.use_unaligned_nodes && !alignment_found) { alignment_found = unaligned_heuristic.compute_aligned_space(p_ref[i][j], @@ -979,19 +1004,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, time_from = min(time_from, ref.time_from()); time_to = max(time_to, ref.time_to()); } - leaf_node->m_time_from = time_from; - leaf_node->m_time_to = time_to; + leaf_node->time_from = time_from; + leaf_node->time_to = time_to; } if(alignment_found) { /* Need to recalculate leaf bounds with new alignment. 
*/ - leaf_node->m_bounds = BoundBox::empty; + leaf_node->bounds = BoundBox::empty; for(int j = 0; j < num; ++j) { const BVHReference &ref = p_ref[i][j]; BoundBox ref_bounds = unaligned_heuristic.compute_aligned_prim_boundbox( ref, aligned_space); - leaf_node->m_bounds.grow(ref_bounds); + leaf_node->bounds.grow(ref_bounds); } /* Set alignment space. */ leaf_node->set_aligned_space(aligned_space); @@ -1028,11 +1053,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, prim_type.reserve(reserve); prim_index.reserve(reserve); prim_object.reserve(reserve); + if(need_prim_time) { + prim_time.reserve(reserve); + } } prim_type.resize(range_end); prim_index.resize(range_end); prim_object.resize(range_end); + if(need_prim_time) { + prim_time.resize(range_end); + } } spatial_spin_lock.unlock(); @@ -1041,6 +1072,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size); memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size); memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size); + if(need_prim_time) { + memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data); + } } } else { @@ -1053,6 +1087,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size); memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size); memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size); + if(need_prim_time) { + memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data); + } } } @@ -1062,8 +1099,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, */ for(int i = 0; i < num_leaves; ++i) { LeafNode *leaf = (LeafNode *)leaves[i]; - leaf->m_lo += start_index; - leaf->m_hi += start_index; + leaf->lo += start_index; + leaf->hi += start_index; } /* Create leaf node for object. 
*/ @@ -1092,17 +1129,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, return new InnerNode(range.bounds(), leaves[0], leaves[1]); } else if(num_leaves == 3) { - BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds); + BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds); BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]); return new InnerNode(range.bounds(), leaves[0], inner); } else { /* Should be doing more branches if more primitive types added. */ assert(num_leaves <= 5); - BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds); - BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds); + BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds); + BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds); BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]); BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]); - BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds); + BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds); BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b); if(num_leaves == 5) { return new InnerNode(range.bounds(), inner_c, leaves[4]); @@ -1137,8 +1174,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth) rotate(parent->children[c], max_depth-1); /* compute current area of all children */ - BoundBox bounds0 = parent->children[0]->m_bounds; - BoundBox bounds1 = parent->children[1]->m_bounds; + BoundBox bounds0 = parent->children[0]->bounds; + BoundBox bounds1 = parent->children[1]->bounds; float area0 = bounds0.half_area(); float area1 = bounds1.half_area(); @@ -1158,8 +1195,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth) BoundBox& other = (c == 0)? 
bounds1: bounds0; /* transpose child bounds */ - BoundBox target0 = child->children[0]->m_bounds; - BoundBox target1 = child->children[1]->m_bounds; + BoundBox target0 = child->children[0]->bounds; + BoundBox target1 = child->children[1]->bounds; /* compute cost for both possible swaps */ float cost0 = merge(other, target1).half_area() - child_area[c]; @@ -1191,7 +1228,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth) InnerNode *child = (InnerNode*)parent->children[best_child]; swap(parent->children[best_other], child->children[best_target]); - child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds); + child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds); } CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index ee3cde66a2f..5733708050d 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -20,13 +20,13 @@ #include <float.h> -#include "bvh.h" -#include "bvh_binning.h" -#include "bvh_unaligned.h" +#include "bvh/bvh.h" +#include "bvh/bvh_binning.h" +#include "bvh/bvh_unaligned.h" -#include "util_boundbox.h" -#include "util_task.h" -#include "util_vector.h" +#include "util/util_boundbox.h" +#include "util/util_task.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -48,6 +48,7 @@ public: array<int>& prim_type, array<int>& prim_index, array<int>& prim_object, + array<float2>& prim_time, const BVHParams& params, Progress& progress); ~BVHBuild(); @@ -112,6 +113,9 @@ protected: array<int>& prim_type; array<int>& prim_index; array<int>& prim_object; + array<float2>& prim_time; + + bool need_prim_time; /* Build parameters. */ BVHParams params; diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 67580e1bc7b..4f788c66797 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -15,12 +15,12 @@ * limitations under the License. 
*/ -#include "bvh.h" -#include "bvh_build.h" -#include "bvh_node.h" +#include "bvh/bvh.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_node.h" -#include "util_debug.h" -#include "util_vector.h" +#include "util/util_debug.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -62,12 +62,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } return cnt; case BVH_STAT_ALIGNED_COUNT: - if(!is_unaligned()) { + if(!is_unaligned) { cnt = 1; } break; case BVH_STAT_UNALIGNED_COUNT: - if(is_unaligned()) { + if(is_unaligned) { cnt = 1; } break; @@ -75,7 +75,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const if(!is_leaf()) { bool has_unaligned = false; for(int j = 0; j < num_children(); j++) { - has_unaligned |= get_child(j)->is_unaligned(); + has_unaligned |= get_child(j)->is_unaligned; } cnt += has_unaligned? 0: 1; } @@ -84,7 +84,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const if(!is_leaf()) { bool has_unaligned = false; for(int j = 0; j < num_children(); j++) { - has_unaligned |= get_child(j)->is_unaligned(); + has_unaligned |= get_child(j)->is_unaligned; } cnt += has_unaligned? 
1: 0; } @@ -95,12 +95,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const for(int i = 0; i < num_children(); i++) { BVHNode *node = get_child(i); if(node->is_leaf()) { - has_unaligned |= node->is_unaligned(); + has_unaligned |= node->is_unaligned; } else { for(int j = 0; j < node->num_children(); j++) { cnt += node->get_child(j)->getSubtreeSize(stat); - has_unaligned |= node->get_child(j)->is_unaligned(); + has_unaligned |= node->get_child(j)->is_unaligned; } } } @@ -113,12 +113,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const for(int i = 0; i < num_children(); i++) { BVHNode *node = get_child(i); if(node->is_leaf()) { - has_unaligned |= node->is_unaligned(); + has_unaligned |= node->is_unaligned; } else { for(int j = 0; j < node->num_children(); j++) { cnt += node->get_child(j)->getSubtreeSize(stat); - has_unaligned |= node->get_child(j)->is_unaligned(); + has_unaligned |= node->get_child(j)->is_unaligned; } } } @@ -126,10 +126,10 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } return cnt; case BVH_STAT_ALIGNED_LEAF_COUNT: - cnt = (is_leaf() && !is_unaligned()) ? 1 : 0; + cnt = (is_leaf() && !is_unaligned) ? 1 : 0; break; case BVH_STAT_UNALIGNED_LEAF_COUNT: - cnt = (is_leaf() && is_unaligned()) ? 1 : 0; + cnt = (is_leaf() && is_unaligned) ? 
1 : 0; break; default: assert(0); /* unknown mode */ @@ -157,7 +157,7 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons for(int i = 0; i < num_children(); i++) { BVHNode *child = get_child(i); - SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area()); + SAH += child->computeSubtreeSAHCost(p, probability * child->bounds.safe_area()/bounds.safe_area()); } return SAH; @@ -165,15 +165,15 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons uint BVHNode::update_visibility() { - if(!is_leaf() && m_visibility == 0) { + if(!is_leaf() && visibility == 0) { InnerNode *inner = (InnerNode*)this; BVHNode *child0 = inner->children[0]; BVHNode *child1 = inner->children[1]; - m_visibility = child0->update_visibility()|child1->update_visibility(); + visibility = child0->update_visibility()|child1->update_visibility(); } - return m_visibility; + return visibility; } void BVHNode::update_time() @@ -184,8 +184,8 @@ void BVHNode::update_time() BVHNode *child1 = inner->children[1]; child0->update_time(); child1->update_time(); - m_time_from = min(child0->m_time_from, child1->m_time_from); - m_time_to = max(child0->m_time_to, child1->m_time_to); + time_from = min(child0->time_from, child1->time_from); + time_to = max(child0->time_to, child1->time_to); } } @@ -209,7 +209,7 @@ void LeafNode::print(int depth) const for(int i = 0; i < depth; i++) printf(" "); - printf("leaf node %d to %d\n", m_lo, m_hi); + printf("leaf node %d to %d\n", lo, hi); } CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 090c426de56..60511b4b012 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -18,9 +18,9 @@ #ifndef __BVH_NODE_H__ #define __BVH_NODE_H__ -#include "util_boundbox.h" -#include "util_debug.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_debug.h" +#include "util/util_types.h" 
CCL_NAMESPACE_BEGIN @@ -46,16 +46,16 @@ class BVHParams; class BVHNode { public: - BVHNode() : m_is_unaligned(false), - m_aligned_space(NULL), - m_time_from(0.0f), - m_time_to(1.0f) + BVHNode() : is_unaligned(false), + aligned_space(NULL), + time_from(0.0f), + time_to(1.0f) { } virtual ~BVHNode() { - delete m_aligned_space; + delete aligned_space; } virtual bool is_leaf() const = 0; @@ -63,30 +63,26 @@ public: virtual BVHNode *get_child(int i) const = 0; virtual int num_triangles() const { return 0; } virtual void print(int depth = 0) const = 0; - bool is_unaligned() const { return m_is_unaligned; } inline void set_aligned_space(const Transform& aligned_space) { - m_is_unaligned = true; - if(m_aligned_space == NULL) { - m_aligned_space = new Transform(aligned_space); + is_unaligned = true; + if(this->aligned_space == NULL) { + this->aligned_space = new Transform(aligned_space); } else { - *m_aligned_space = aligned_space; + *this->aligned_space = aligned_space; } } inline Transform get_aligned_space() const { - if(m_aligned_space == NULL) { + if(aligned_space == NULL) { return transform_identity(); } - return *m_aligned_space; + return *aligned_space; } - BoundBox m_bounds; - uint m_visibility; - // Subtree functions int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const; float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const; @@ -95,13 +91,18 @@ public: uint update_visibility(); void update_time(); - bool m_is_unaligned; + // Properties. + BoundBox bounds; + uint visibility; + + bool is_unaligned; - // TODO(sergey): Can be stored as 3x3 matrix, but better to have some - // utilities and type defines in util_transform first. - Transform *m_aligned_space; + /* TODO(sergey): Can be stored as 3x3 matrix, but better to have some + * utilities and type defines in util_transform first. 
+ */ + Transform *aligned_space; - float m_time_from, m_time_to; + float time_from, time_to; }; class InnerNode : public BVHNode @@ -111,20 +112,20 @@ public: BVHNode* child0, BVHNode* child1) { - m_bounds = bounds; + this->bounds = bounds; children[0] = child0; children[1] = child1; if(child0 && child1) - m_visibility = child0->m_visibility|child1->m_visibility; + visibility = child0->visibility|child1->visibility; else - m_visibility = 0; /* happens on build cancel */ + visibility = 0; /* happens on build cancel */ } explicit InnerNode(const BoundBox& bounds) { - m_bounds = bounds; - m_visibility = 0; + this->bounds = bounds; + visibility = 0; children[0] = NULL; children[1] = NULL; } @@ -140,12 +141,12 @@ public: class LeafNode : public BVHNode { public: - LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) + LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) + : lo(lo), + hi(hi) { - m_bounds = bounds; - m_visibility = visibility; - m_lo = lo; - m_hi = hi; + this->bounds = bounds; + this->visibility = visibility; } LeafNode(const LeafNode& s) @@ -157,14 +158,13 @@ public: bool is_leaf() const { return true; } int num_children() const { return 0; } BVHNode *get_child(int) const { return NULL; } - int num_triangles() const { return m_hi - m_lo; } + int num_triangles() const { return hi - lo; } void print(int depth) const; - int m_lo; - int m_hi; + int lo; + int hi; }; CCL_NAMESPACE_END #endif /* __BVH_NODE_H__ */ - diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 65f9da1c194..9795a7a4350 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -18,9 +18,9 @@ #ifndef __BVH_PARAMS_H__ #define __BVH_PARAMS_H__ -#include "util_boundbox.h" +#include "util/util_boundbox.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN @@ -104,6 +104,7 @@ public: primitive_mask = PRIMITIVE_ALL; num_motion_curve_steps = 0; + num_motion_triangle_steps = 0; } /* SAH 
costs */ diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index e5bcf9995bf..d29629c0279 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -15,12 +15,12 @@ * limitations under the License. */ -#include "bvh_build.h" -#include "bvh_sort.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_sort.h" -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_task.h" +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index d0d5fbe5a7a..b10d69a495d 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -15,14 +15,14 @@ * limitations under the License. */ -#include "bvh_build.h" -#include "bvh_split.h" -#include "bvh_sort.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_split.h" +#include "bvh/bvh_sort.h" -#include "mesh.h" -#include "object.h" +#include "render/mesh.h" +#include "render/object.h" -#include "util_algorithm.h" +#include "util/util_algorithm.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h index dbdb51f1a5b..a874a118b99 100644 --- a/intern/cycles/bvh/bvh_split.h +++ b/intern/cycles/bvh/bvh_split.h @@ -18,8 +18,8 @@ #ifndef __BVH_SPLIT_H__ #define __BVH_SPLIT_H__ -#include "bvh_build.h" -#include "bvh_params.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_params.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp index a876c670914..ef227d20ea9 100644 --- a/intern/cycles/bvh/bvh_unaligned.cpp +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -15,17 +15,17 @@ */ -#include "bvh_unaligned.h" +#include "bvh/bvh_unaligned.h" -#include "mesh.h" -#include "object.h" +#include "render/mesh.h" +#include "render/object.h" -#include "bvh_binning.h" +#include "bvh/bvh_binning.h" #include "bvh_params.h" -#include "util_boundbox.h" 
-#include "util_debug.h" -#include "util_transform.h" +#include "util/util_boundbox.h" +#include "util/util_debug.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h index 4d0872f4a39..f41bae79e2b 100644 --- a/intern/cycles/bvh/bvh_unaligned.h +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -17,7 +17,7 @@ #ifndef __BVH_UNALIGNED_H__ #define __BVH_UNALIGNED_H__ -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 966ff5e52ba..6ef2aa1caad 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - ../graph - ../kernel - ../kernel/svm - ../kernel/osl - ../util - ../render + .. ../../glew-mx ) @@ -33,6 +27,7 @@ set(SRC device_cuda.cpp device_multi.cpp device_opencl.cpp + device_split_kernel.cpp device_task.cpp ) @@ -56,6 +51,7 @@ set(SRC_HEADERS device_memory.h device_intern.h device_network.h + device_split_kernel.h device_task.h ) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 31c99f49d6d..968af447e29 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -17,18 +17,18 @@ #include <stdlib.h> #include <string.h> -#include "device.h" -#include "device_intern.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_half.h" -#include "util_math.h" -#include "util_opengl.h" -#include "util_time.h" -#include "util_types.h" -#include "util_vector.h" -#include "util_string.h" +#include "device/device.h" +#include "device/device_intern.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_half.h" +#include "util/util_math.h" +#include "util/util_opengl.h" +#include "util/util_time.h" +#include "util/util_types.h" +#include "util/util_vector.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ 
-48,11 +48,11 @@ std::ostream& operator <<(std::ostream &os, os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; /* TODO(sergey): Decode bitflag into list of names. */ os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use hair: " + os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use object motion: " + os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) << std::endl; - os << "Use camera motion: " + os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) << std::endl; os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; @@ -80,7 +80,7 @@ Device::~Device() void Device::pixels_alloc(device_memory& mem) { - mem_alloc(mem, MEM_READ_WRITE); + mem_alloc("pixels", mem, MEM_READ_WRITE); } void Device::pixels_copy_from(device_memory& mem, int y, int w, int h) diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ccee25ae34e..ac06e561795 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -19,15 +19,15 @@ #include <stdlib.h> -#include "device_memory.h" -#include "device_task.h" +#include "device/device_memory.h" +#include "device/device_task.h" -#include "util_list.h" -#include "util_stats.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_stats.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -121,6 +121,9 @@ public: /* Use Transparent shadows */ bool use_transparent; + /* Use various shadow tricks, such as shadow catcher. */ + bool use_shadow_tricks; + DeviceRequestedFeatures() { /* TODO(sergey): Find more meaningful defaults. 
*/ @@ -137,6 +140,7 @@ public: use_integrator_branched = false; use_patch_evaluation = false; use_transparent = false; + use_shadow_tricks = false; } bool modified(const DeviceRequestedFeatures& requested_features) @@ -153,7 +157,8 @@ public: use_volume == requested_features.use_volume && use_integrator_branched == requested_features.use_integrator_branched && use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent); + use_transparent == requested_features.use_transparent && + use_shadow_tricks == requested_features.use_shadow_tricks); } /* Convert the requested features structure to a build options, @@ -194,9 +199,12 @@ public: if(!use_patch_evaluation) { build_options += " -D__NO_PATCH_EVAL__"; } - if(!use_transparent) { + if(!use_transparent && !use_volume) { build_options += " -D__NO_TRANSPARENT__"; } + if(!use_shadow_tricks) { + build_options += " -D__NO_SHADOW_TRICKS__"; + } return build_options; } }; @@ -228,13 +236,21 @@ public: DeviceInfo info; virtual const string& error_message() { return error_msg; } bool have_error() { return !error_message().empty(); } + virtual void set_error(const string& error) + { + if(!have_error()) { + error_msg = error; + } + fprintf(stderr, "%s\n", error.c_str()); + fflush(stderr); + } virtual bool show_samples() const { return false; } /* statistics */ Stats &stats; /* regular memory */ - virtual void mem_alloc(device_memory& mem, MemoryType type) = 0; + virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0; virtual void mem_copy_to(device_memory& mem) = 0; virtual void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) = 0; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c8e001ec2fd..2761d9488ca 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -20,36 +20,124 @@ /* So ImathMath is included before our kernel_cpu_compat. 
*/ #ifdef WITH_OSL /* So no context pollution happens from indirectly included windows.h */ -# include "util_windows.h" +# include "util/util_windows.h" # include <OSL/oslexec.h> #endif -#include "device.h" -#include "device_intern.h" +#include "device/device.h" +#include "device/device_intern.h" +#include "device/device_split_kernel.h" -#include "kernel.h" -#include "kernel_compat_cpu.h" -#include "kernel_types.h" -#include "kernel_globals.h" +#include "kernel/kernel.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data.h" +#include "kernel/kernel_globals.h" -#include "osl_shader.h" -#include "osl_globals.h" +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" -#include "buffers.h" +#include "render/buffers.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_function.h" -#include "util_logging.h" -#include "util_opengl.h" -#include "util_progress.h" -#include "util_system.h" -#include "util_thread.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_thread.h" CCL_NAMESPACE_BEGIN +class CPUDevice; + +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 
split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; + class CPUDevice : public Device { + static unordered_map<string, void*> kernel_functions; + + static void register_kernel_function(const char* name, void* func) + { + kernel_functions[name] = func; + } + + static const char* get_arch_name() + { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + return "cpu_avx2"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + return "cpu_avx"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + return "cpu_sse41"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + return "cpu_sse3"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + return "cpu_sse2"; + } + else +#endif + { + return "cpu"; + } + } + + template<typename F> + static F get_kernel_function(string name) + { + name = string("kernel_") + get_arch_name() + "_" + name; + + unordered_map<string, void*>::iterator it = kernel_functions.find(name); + + if(it == kernel_functions.end()) { + assert(!"kernel function not found"); + return NULL; + } + + return (F)it->second; + } + + friend class CPUSplitKernel; + public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -57,10 +145,15 @@ public: #ifdef WITH_OSL OSLGlobals osl_globals; #endif + + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; CPUDevice(DeviceInfo& info, Stats &stats, bool background) : Device(info, stats, background) { + #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif @@ -105,6 +198,28 @@ public: { VLOG(1) << "Will be using regular kernels."; } + + use_split_kernel = DebugFlags().cpu.split_kernel; + if(use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + + 
kernel_cpu_register_functions(register_kernel_function); +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + kernel_cpu_sse2_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + kernel_cpu_sse3_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + kernel_cpu_sse41_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + kernel_cpu_avx_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + kernel_cpu_avx2_register_functions(register_kernel_function); +#endif } ~CPUDevice() @@ -117,9 +232,20 @@ public: return (TaskScheduler::num_threads() == 1); } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + mem.device_pointer = mem.data_pointer; + + if(!mem.device_pointer) { + mem.device_pointer = (device_ptr)malloc(mem.memory_size()); + } + mem.device_size = mem.memory_size(); stats.mem_alloc(mem.device_size); } @@ -144,6 +270,10 @@ public: void mem_free(device_memory& mem) { if(mem.device_pointer) { + if(!mem.data_pointer) { + free((void*)mem.device_pointer); + } + mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; @@ -196,8 +326,14 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) - thread_path_trace(*task); + if(task->type == DeviceTask::PATH_TRACE) { + if(!use_split_kernel) { + thread_path_trace(*task); + } + else { + thread_path_trace_split(*task); + } + } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) @@ -258,7 +394,7 @@ public: { path_trace_kernel = kernel_cpu_path_trace; } - + 
while(task.acquire_tile(this, tile)) { float *render_buffer = (float*)tile.buffer; uint *rng_state = (uint*)tile.rng_state; @@ -294,6 +430,49 @@ public: thread_kernel_globals_free(&kg); } + void thread_path_trace_split(DeviceTask& task) + { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } + + RenderTile tile; + + CPUSplitKernel split_kernel(this); + + /* allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); + + KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer; + *kg = thread_kernel_globals_init(); + + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel.load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + + return; + } + + while(task.acquire_tile(this, tile)) { + device_memory data; + split_kernel.path_trace(&task, tile, kgbuffer, data); + + task.release_tile(tile); + + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + } + + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + } + void thread_film_convert(DeviceTask& task) { float sample_scale = 1.0f/(task.sample + 1); @@ -501,6 +680,10 @@ protected: inline void thread_kernel_globals_free(KernelGlobals *kg) { + if(kg == NULL) { + return; + } + if(kg->transparent_shadow_intersections != NULL) { free(kg->transparent_shadow_intersections); } @@ -515,8 +698,175 @@ protected: OSLShader::thread_free(kg); #endif } + + virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) { + requested_features = requested_features_; + + return true; + } +}; + +/* split kernel */ + +class CPUSplitKernelFunction : public SplitKernelFunction { +public: + CPUDevice* device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} + 
~CPUSplitKernelFunction() {} + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) + { + if(!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData*)data.device_pointer); + } + } + + return true; + } }; +CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flags, + device_memory& work_pool_wgs) +{ + typedef void(*data_init_t)(KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + + data_init_t data_init; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + data_init = kernel_cpu_avx2_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + data_init = kernel_cpu_avx_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + data_init = kernel_cpu_sse41_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + data_init = 
kernel_cpu_sse3_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + data_init = kernel_cpu_sse2_data_init; + } + else +#endif + { + data_init = kernel_cpu_data_init; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + data_init((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); + } + } + + return true; +} + +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + + kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + if(!kernel->func) { + delete kernel; + return NULL; + } + + return kernel; +} + +int2 CPUSplitKernel::split_kernel_local_size() +{ + return make_int2(1, 1); +} + +int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) { + return make_int2(64, 1); +} + +uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + + return split_data_buffer_size(kg, num_threads); +} + +unordered_map<string, void*> 
CPUDevice::kernel_functions; + Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index dafac6dfcb3..606494f08ed 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -15,32 +15,36 @@ */ #include <climits> +#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include "device.h" -#include "device_intern.h" +#include "device/device.h" +#include "device/device_intern.h" +#include "device/device_split_kernel.h" -#include "buffers.h" +#include "render/buffers.h" #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else -# include "util_opengl.h" +# include "util/util_opengl.h" # include <cuda.h> # include <cudaGL.h> #endif -#include "util_debug.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_md5.h" -#include "util_opengl.h" -#include "util_path.h" -#include "util_string.h" -#include "util_system.h" -#include "util_types.h" -#include "util_time.h" +#include "util/util_debug.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_md5.h" +#include "util/util_opengl.h" +#include "util/util_path.h" +#include "util/util_string.h" +#include "util/util_system.h" +#include "util/util_types.h" +#include "util/util_time.h" + +#include "kernel/split/kernel_split_data_types.h" CCL_NAMESPACE_BEGIN @@ -78,6 +82,31 @@ int cuewCompilerVersion(void) } /* namespace */ #endif /* WITH_CUDA_DYNLOAD */ +class CUDADevice; + +class CUDASplitKernel : public DeviceSplitKernel { + CUDADevice *device; +public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& 
kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); +}; + class CUDADevice : public Device { public: @@ -258,16 +287,21 @@ public: return DebugFlags().cuda.adaptive_compile; } + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + /* Common NVCC flags which stays the same regardless of shading model, * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features) + const DeviceRequestedFeatures& requested_features, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); - const string kernel_path = path_get("kernel"); - const string include = kernel_path; + const string source_path = path_get("source"); + const string include_path = source_path; string cflags = string_printf("-m%d " "--ptxas-options=\"-v\" " "--use_fast_math " @@ -276,7 +310,7 @@ public: "-I\"%s\"", machine, cuda_version, - include.c_str()); + include_path.c_str()); if(use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } @@ -287,6 +321,11 @@ public: #ifdef WITH_CYCLES_DEBUG cflags += " -D__KERNEL_DEBUG__"; #endif + + if(split) { + cflags += " -D__SPLIT__"; + } + return cflags; } @@ -306,21 +345,21 @@ public: cuda_error_message("CUDA nvcc compiler version could not be parsed."); return false; } - if(cuda_version < 75) { + if(cuda_version < 80) { printf("Unsupported CUDA version %d.%d detected, " - "you need CUDA 7.5 or newer.\n", + "you need CUDA 8.0 or newer.\n", major, minor); return false; } - else if(cuda_version != 75 && 
cuda_version != 80) { + else if(cuda_version != 80) { printf("CUDA version %d.%d detected, build may succeed but only " - "CUDA 7.5 and 8.0 are officially supported.\n", + "CUDA 8.0 is officially supported.\n", major, minor); } return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features) + string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) { /* Compute cubin name. */ int major, minor; @@ -329,7 +368,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", + const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin" + : "lib/kernel_sm_%d%d.cubin", major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { @@ -339,18 +379,19 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features); + compile_kernel_get_common_cflags(requested_features, split); /* Try to use locally compiled kernel. */ - const string kernel_path = path_get("kernel"); - const string kernel_md5 = path_files_md5_hash(kernel_path); + const string source_path = path_get("source"); + const string kernel_md5 = path_files_md5_hash(source_path); /* We include cflags into md5 so changing cuda toolkit or changing other * compiler command line arguments makes sure cubin gets re-built. */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin", + const string cubin_file = string_printf(split ? 
"cycles_kernel_split_sm%d%d_%s.cubin" + : "cycles_kernel_sm%d%d_%s.cubin", major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); @@ -383,9 +424,10 @@ public: return ""; } const char *nvcc = cuewCompilerPath(); - const string kernel = path_join(kernel_path, - path_join("kernels", - path_join("cuda", "kernel.cu"))); + const string kernel = path_join( + path_join(source_path, "kernel"), + path_join("kernels", + path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -433,7 +475,7 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features); + string cubin = compile_kernel(requested_features, use_split_kernel()); if(cubin == "") return false; @@ -466,8 +508,14 @@ public: } } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + cuda_push_context(); CUdeviceptr device_pointer; size_t size = mem.memory_size(); @@ -504,7 +552,9 @@ public: void mem_zero(device_memory& mem) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } cuda_push_context(); if(mem.device_pointer) @@ -617,7 +667,7 @@ public: /* Data Storage */ if(interpolation == INTERPOLATION_NONE) { if(has_bindless_textures) { - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); cuda_push_context(); @@ -641,7 +691,7 @@ public: cuda_pop_context(); } else { - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); cuda_push_context(); @@ -1258,25 +1308,48 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(!use_split_kernel()) { + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - path_trace(tile, sample, branched); + path_trace(tile, sample, branched); - tile.sample = sample + 1; + tile.sample = sample + 1; - task->update_progress(&tile, tile.w*tile.h); + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + } + } + else { + DeviceRequestedFeatures requested_features; + if(!use_adaptive_compilation()) { + requested_features.max_closure = 64; } - task->release_tile(tile); + 
CUDASplitKernel split_kernel(this); + split_kernel.load_kernels(requested_features); + + while(task->acquire_tile(this, tile)) { + device_memory void_buffer; + split_kernel.path_trace(task, tile, void_buffer, void_buffer); + + task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + } } } else if(task->type == DeviceTask::SHADER) { @@ -1329,8 +1402,223 @@ public: { task_pool.cancel(); } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; +}; + +/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class + * now that the definition of that class is complete + */ +#undef cuda_assert +#define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ + \ + if(result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if(device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } (void)0 + +/* split kernel */ + +class CUDASplitKernelFunction : public SplitKernelFunction{ + CUDADevice* device; + CUfunction func; +public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + device->cuda_push_context(); + + if(device->have_error()) + return false; + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + + int xblocks = 
(dim.global_size[0] + xthreads - 1)/xthreads; + int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks , yblocks, 1, /* blocks */ + xthreads, ythreads, 1, /* threads */ + 0, 0, args, 0)); + + device->cuda_pop_context(); + + return !device->have_error(); + } }; +CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +{ + device_vector<uint64_t> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + device->cuda_push_context(); + + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + + struct args_t { + uint* num_threads; + CUdeviceptr* size; + }; + + args_t args = { + &threads, + &d_size + }; + + CUfunction state_buffer_size; + cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + + cuda_assert(cuLaunchKernel(state_buffer_size, + 1, 1, 1, + 1, 1, 1, + 0, 0, (void**)&args, 0)); + + device->cuda_pop_context(); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t)); + device->mem_free(size_buffer); + + return *size_buffer.get_data(); +} + +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& /*kernel_globals*/, + device_memory& /*kernel_data*/, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) +{ + device->cuda_push_context(); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = 
device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr* split_data_buffer; + int* num_elements; + CUdeviceptr* ray_state; + CUdeviceptr* rng_state; + int* start_sample; + int* end_sample; + int* sx; + int* sy; + int* sw; + int* sh; + int* offset; + int* stride; + CUdeviceptr* queue_index; + int* queuesize; + CUdeviceptr* use_queues_flag; + CUdeviceptr* work_pool_wgs; + int* num_samples; + CUdeviceptr* buffer; + }; + + args_t args = { + &d_split_data, + &num_global_elements, + &d_ray_state, + &d_rng_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer + }; + + CUfunction data_init; + cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if(device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); + + device->cuda_pop_context(); + + return !device->have_error(); +} + +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CUfunction func; + + device->cuda_push_context(); + + cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if(device->have_error()) { + device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + 
device->cuda_pop_context(); + + return new CUDASplitKernelFunction(device, func); +} + +int2 CUDASplitKernel::split_kernel_local_size() +{ + return make_int2(32, 1); +} + +int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/) +{ + /* TODO(mai): implement something here to detect ideal work size */ + return make_int2(256, 256); +} + bool device_cuda_init(void) { #ifdef WITH_CUDA_DYNLOAD diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 5b5b4dc6802..4b10514a9d2 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -28,10 +28,10 @@ * other devices this is a pointer to device memory, where we will copy memory * to and from. */ -#include "util_debug.h" -#include "util_half.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_debug.h" +#include "util/util_half.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -48,7 +48,8 @@ enum DataType { TYPE_UINT, TYPE_INT, TYPE_FLOAT, - TYPE_HALF + TYPE_HALF, + TYPE_UINT64, }; static inline size_t datatype_size(DataType datatype) @@ -59,6 +60,7 @@ static inline size_t datatype_size(DataType datatype) case TYPE_UINT: return sizeof(uint); case TYPE_INT: return sizeof(int); case TYPE_HALF: return sizeof(half); + case TYPE_UINT64: return sizeof(uint64_t); default: return 0; } } @@ -160,6 +162,11 @@ template<> struct device_type_traits<half4> { static const int num_elements = 4; }; +template<> struct device_type_traits<uint64_t> { + static const DataType data_type = TYPE_UINT64; + static const int num_elements = 1; +}; + /* Device Memory */ class device_memory @@ -180,10 +187,27 @@ public: /* device pointer */ device_ptr device_pointer; -protected: - device_memory() {} + device_memory() + { + data_type = device_type_traits<uchar>::data_type; + data_elements = device_type_traits<uchar>::num_elements; + data_pointer = 0; + data_size = 
0; + device_size = 0; + data_width = 0; + data_height = 0; + data_depth = 0; + device_pointer = 0; + } virtual ~device_memory() { assert(!device_pointer); } + void resize(size_t size) + { + data_size = size; + data_width = size; + } + +protected: /* no copying */ device_memory(const device_memory&); device_memory& operator = (const device_memory&); @@ -198,16 +222,8 @@ public: { data_type = device_type_traits<T>::data_type; data_elements = device_type_traits<T>::num_elements; - data_pointer = 0; - data_size = 0; - device_size = 0; - data_width = 0; - data_height = 0; - data_depth = 0; assert(data_elements > 0); - - device_pointer = 0; } virtual ~device_vector() {} @@ -266,6 +282,7 @@ public: data_height = 0; data_depth = 0; data_size = 0; + device_pointer = 0; } size_t size() diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 31b800640d3..624260a81c8 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -17,17 +17,17 @@ #include <stdlib.h> #include <sstream> -#include "device.h" -#include "device_intern.h" -#include "device_network.h" +#include "device/device.h" +#include "device/device_intern.h" +#include "device/device_network.h" -#include "buffers.h" +#include "render/buffers.h" -#include "util_foreach.h" -#include "util_list.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_time.h" +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -106,11 +106,11 @@ public: return true; } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { foreach(SubDevice& sub, devices) { mem.device_pointer = 0; - sub.device->mem_alloc(mem, type); + sub.device->mem_alloc(name, mem, type); sub.ptr_map[unique_ptr] = mem.device_pointer; } @@ -162,6 +162,7 @@ public: void 
mem_free(device_memory& mem) { device_ptr tmp = mem.device_pointer; + stats.mem_free(mem.device_size); foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; @@ -170,7 +171,6 @@ public: } mem.device_pointer = 0; - stats.mem_free(mem.device_size); } void const_copy_to(const char *name, void *host, size_t size) @@ -202,6 +202,7 @@ public: void tex_free(device_memory& mem) { device_ptr tmp = mem.device_pointer; + stats.mem_free(mem.device_size); foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; @@ -210,7 +211,6 @@ public: } mem.device_pointer = 0; - stats.mem_free(mem.device_size); } void pixels_alloc(device_memory& mem) diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 53eef6cf199..66758954f44 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "device.h" -#include "device_intern.h" -#include "device_network.h" +#include "device/device.h" +#include "device/device_intern.h" +#include "device/device_network.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" #if defined(WITH_NETWORK) @@ -87,8 +87,14 @@ public: snd.write(); } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + thread_scoped_lock lock(rpc_lock); mem.device_pointer = ++mem_counter; @@ -481,7 +487,7 @@ protected: mem.data_pointer = 0; /* perform the allocation on the actual device */ - device->mem_alloc(mem, type); + device->mem_alloc(NULL, mem, type); /* store a mapping to/from client_pointer and real device pointer */ pointer_mapping_insert(client_pointer, mem.device_pointer); diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index d28cfe3121f..a5d24c66018 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -33,12 +33,12 @@ #include <sstream> #include <deque> -#include "buffers.h" +#include "render/buffers.h" -#include "util_foreach.h" -#include "util_list.h" -#include "util_map.h" -#include "util_string.h" +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_map.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index ba94c592a5f..edd2047debc 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -16,12 +16,12 @@ #ifdef WITH_OPENCL -#include "opencl/opencl.h" +#include "device/opencl/opencl.h" -#include "device_intern.h" +#include "device/device_intern.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp new file mode 100644 index 00000000000..ae462a560b7 --- /dev/null +++ b/intern/cycles/device/device_split_kernel.cpp @@ -0,0 +1,306 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_split_kernel.h" + +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" + +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +static const double alpha = 0.1; /* alpha for rolling average */ + +DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) +{ + current_max_closure = -1; + first_tile = true; + + avg_time_per_sample = 0.0; + + kernel_path_init = NULL; + kernel_scene_intersect = NULL; + kernel_lamp_emission = NULL; + kernel_do_volume = NULL; + kernel_queue_enqueue = NULL; + kernel_indirect_background = NULL; + kernel_shader_eval = NULL; + kernel_holdout_emission_blurring_pathtermination_ao = NULL; + kernel_subsurface_scatter = NULL; + kernel_direct_lighting = NULL; + kernel_shadow_blocked_ao = NULL; + kernel_shadow_blocked_dl = NULL; + kernel_next_iteration_setup = NULL; + kernel_indirect_subsurface = NULL; + kernel_buffer_update = NULL; +} + +DeviceSplitKernel::~DeviceSplitKernel() +{ + device->mem_free(split_data); + device->mem_free(ray_state); + device->mem_free(use_queues_flag); + device->mem_free(queue_index); + device->mem_free(work_pool_wgs); + + delete kernel_path_init; + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_do_volume; + delete kernel_queue_enqueue; + delete kernel_indirect_background; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_subsurface_scatter; + delete kernel_direct_lighting; + delete kernel_shadow_blocked_ao; + delete kernel_shadow_blocked_dl; + 
delete kernel_next_iteration_setup; + delete kernel_indirect_subsurface; + delete kernel_buffer_update; +} +
+/* Load every split kernel stage through get_split_kernel_function().
+ *
+ * Returns false as soon as one stage cannot be obtained; on success the
+ * closure count the kernels were requested for is remembered in
+ * current_max_closure.
+ */
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+#define LOAD_KERNEL(name) \
+		kernel_##name = get_split_kernel_function(#name, requested_features); \
+		if(!kernel_##name) { \
+			return false; \
+		}
+
+	LOAD_KERNEL(path_init);
+	LOAD_KERNEL(scene_intersect);
+	LOAD_KERNEL(lamp_emission);
+	LOAD_KERNEL(do_volume);
+	LOAD_KERNEL(queue_enqueue);
+	LOAD_KERNEL(indirect_background);
+	LOAD_KERNEL(shader_eval);
+	LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+	LOAD_KERNEL(subsurface_scatter);
+	LOAD_KERNEL(direct_lighting);
+	LOAD_KERNEL(shadow_blocked_ao);
+	LOAD_KERNEL(shadow_blocked_dl);
+	LOAD_KERNEL(next_iteration_setup);
+	LOAD_KERNEL(indirect_subsurface);
+	LOAD_KERNEL(buffer_update);
+
+#undef LOAD_KERNEL
+
+	current_max_closure = requested_features.max_closure;
+
+	return true;
+}
+
+/* Number of state elements that fit into max_buffer_size bytes.
+ *
+ * The per-element size is derived from the state buffer size reported for
+ * 1024 elements. Returns 0 when that size rounds down to zero, so a failed
+ * or degenerate size query does not end in a division by zero.
+ */
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
+{
+	uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+	if(size_per_element == 0) {
+		return 0;
+	}
+	return max_buffer_size / size_per_element;
+}
+
+/* Render all samples of a tile with the split kernel.
+ *
+ * The shared state buffers are allocated on the first call only. Samples are
+ * then rendered in batches sized from a rolling average of the time per
+ * sample; every batch runs the data-init kernel, path_init and then the
+ * path-iteration kernels until no ray is active anymore.
+ *
+ * Returns false on a device or enqueue error, true otherwise (including
+ * when the task was cancelled).
+ */
+bool DeviceSplitKernel::path_trace(DeviceTask *task,
+                                   RenderTile& tile,
+                                   device_memory& kgbuffer,
+                                   device_memory& kernel_data)
+{
+	if(device->have_error()) {
+		return false;
+	}
+
+	/* Get local size */
+	size_t local_size[2];
+	{
+		int2 lsize = split_kernel_local_size();
+		local_size[0] = lsize[0];
+		local_size[1] = lsize[1];
+	}
+
+	/* Set global size */
+	size_t global_size[2];
+	{
+		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+		/* Make sure that set work size is a multiple of local
+		 * work size dimensions.
+		 */
+		global_size[0] = round_up(gsize[0], local_size[0]);
+		global_size[1] = round_up(gsize[1], local_size[1]);
+	}
+
+	/* Number of elements in the global state buffer */
+	int num_global_elements = global_size[0] * global_size[1];
+	assert(num_global_elements % WORK_POOL_SIZE == 0);
+
+	/* Allocate all required global memory once. */
+	if(first_tile) {
+		first_tile = false;
+
+		/* Calculate max groups */
+
+		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
+		unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
+
+		/* Allocate work_pool_wgs memory. */
+		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
+
+		queue_index.resize(NUM_QUEUES * sizeof(int));
+		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
+
+		use_queues_flag.resize(sizeof(char));
+		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
+
+		ray_state.resize(num_global_elements);
+		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
+
+		split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
+	}
+
+/* Enqueue one kernel stage; leaves path_trace() with false on any error. */
+#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
+		if(device->have_error()) { \
+			return false; \
+		} \
+		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+			return false; \
+		}
+
+	tile.sample = tile.start_sample;
+
+	/* for exponential increase between tile updates */
+	int time_multiplier = 1;
+
+	while(tile.sample < tile.start_sample + tile.num_samples) {
+		/* to keep track of how long it takes to run a number of samples */
+		double start_time = time_dt();
+
+		/* initial guess to start rolling average */
+		const int initial_num_samples = 1;
+		/* samples still to be rendered; positive because of the loop condition */
+		const int remaining_samples = tile.start_sample + tile.num_samples - tile.sample;
+		/* approx number of samples per second; the estimate is clamped while
+		 * still a double so that a very small average time can not overflow
+		 * the conversion to int
+		 */
+		int samples_per_second = initial_num_samples;
+		if(avg_time_per_sample > 0.0) {
+			const double estimate = double(time_multiplier) / avg_time_per_sample + 1.0;
+			samples_per_second = (estimate < double(remaining_samples)) ? int(estimate)
+			                                                            : remaining_samples;
+		}
+
+		RenderTile subtile = tile;
+		subtile.start_sample = tile.sample;
+		subtile.num_samples = min(samples_per_second, remaining_samples);
+
+		if(device->have_error()) {
+			return false;
+		}
+
+		/* reset state memory here as global size for data_init
+		 * kernel might not be large enough to do in kernel
+		 */
+		device->mem_zero(work_pool_wgs);
+		device->mem_zero(split_data);
+		device->mem_zero(ray_state);
+
+		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+		                                   subtile,
+		                                   num_global_elements,
+		                                   kgbuffer,
+		                                   kernel_data,
+		                                   split_data,
+		                                   ray_state,
+		                                   queue_index,
+		                                   use_queues_flag,
+		                                   work_pool_wgs))
+		{
+			return false;
+		}
+
+		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+		bool activeRaysAvailable = true;
+
+		while(activeRaysAvailable) {
+			/* Do path-iteration in host (enqueue path-iteration kernels). */
+			for(int PathIter = 0; PathIter < 16; PathIter++) {
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
+
+				if(task->get_cancel()) {
+					return true;
+				}
+			}
+
+			/* Decide if we should exit path-iteration in host. */
+			device->mem_copy_from(ray_state, 0, num_global_elements * sizeof(char), 1, 1);
+
+			activeRaysAvailable = false;
+
+			/* One state per element of the global work size. */
+			for(int rayStateIter = 0; rayStateIter < num_global_elements; ++rayStateIter) {
+				int8_t state = ray_state.get_data()[rayStateIter];
+
+				if(state != RAY_INACTIVE) {
+					if(state == RAY_INVALID) {
+						/* Something went wrong, abort to avoid looping endlessly. */
+						device->set_error("Split kernel error: invalid ray state");
+						return false;
+					}
+
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(task->get_cancel()) {
+				return true;
+			}
+		}
+
+		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
+
+		if(avg_time_per_sample == 0.0) {
+			/* start rolling average */
+			avg_time_per_sample = time_per_sample;
+		}
+		else {
+			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
+		}
+
+		tile.sample += subtile.num_samples;
+		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+
+		time_multiplier = min(time_multiplier << 1, 10);
+
+		if(task->get_cancel()) {
+			return true;
+		}
+	}
+
+#undef ENQUEUE_SPLIT_KERNEL
+
+	return true;
+}
+ +CCL_NAMESPACE_END + + diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h new file mode 100644 index 00000000000..15a94953a11 --- /dev/null +++ b/intern/cycles/device/device_split_kernel.h @@ -0,0 +1,132 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +
+#ifndef __DEVICE_SPLIT_KERNEL_H__
+#define __DEVICE_SPLIT_KERNEL_H__
+
+#include "device/device.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* When allocating global memory in chunks we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes, since some
+ * bytes may be needed for aligning the chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+/* Types used for split kernel */
+
+/* Global and local 2D work sizes handed to every kernel enqueue. */
+class KernelDimensions {
+public:
+	size_t global_size[2];
+	size_t local_size[2];
+
+	/* Copies both two-element size arrays into the object. */
+	KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+	{
+		memcpy(global_size, global_size_, sizeof(global_size));
+		memcpy(local_size, local_size_, sizeof(local_size));
+	}
+};
+
+/* Interface of a single split kernel stage; implemented per device. */
+class SplitKernelFunction {
+public:
+	virtual ~SplitKernelFunction() {}
+
+	/* enqueue the kernel, returns false if there is an error */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+};
+
+/* Device-independent driver of the split kernel: loads the kernel stages,
+ * owns the shared state buffers and runs the path-iteration loop. The pure
+ * virtual methods below are the hooks a device has to provide.
+ */
+class DeviceSplitKernel {
+private:
+	Device *device;
+
+	/* One function object per split kernel stage, filled in by load_kernels(). */
+	SplitKernelFunction *kernel_path_init;
+	SplitKernelFunction *kernel_scene_intersect;
+	SplitKernelFunction *kernel_lamp_emission;
+	SplitKernelFunction *kernel_do_volume;
+	SplitKernelFunction *kernel_queue_enqueue;
+	SplitKernelFunction *kernel_indirect_background;
+	SplitKernelFunction *kernel_shader_eval;
+	SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+	SplitKernelFunction *kernel_subsurface_scatter;
+	SplitKernelFunction *kernel_direct_lighting;
+	SplitKernelFunction *kernel_shadow_blocked_ao;
+	SplitKernelFunction *kernel_shadow_blocked_dl;
+	SplitKernelFunction *kernel_next_iteration_setup;
+	SplitKernelFunction *kernel_indirect_subsurface;
+	SplitKernelFunction *kernel_buffer_update;
+
+	/* Global memory variables [porting]; this memory is used for
+	 * co-operation between different kernels; data written by one
+	 * kernel will be available to another kernel via this global
+	 * memory.
+	 */
+	device_memory split_data;
+	device_vector<uchar> ray_state;
+	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	device_memory use_queues_flag;
+
+	/* Approximate time it takes to complete one sample */
+	double avg_time_per_sample;
+
+	/* Work pool with respect to each work group. */
+	device_memory work_pool_wgs;
+
+	/* clos_max value for which the kernels have been loaded currently. */
+	int current_max_closure;
+
+	/* Set while the shared buffers still have to be allocated; path_trace()
+	 * clears it as soon as it performs that one-time allocation.
+	 */
+	bool first_tile;
+
+public:
+	explicit DeviceSplitKernel(Device* device);
+	virtual ~DeviceSplitKernel();
+
+	/* Load all kernel stages; returns false if any of them is unavailable. */
+	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+	/* Render all samples of the tile; returns false on a device error. */
+	bool path_trace(DeviceTask *task,
+	                RenderTile& rtile,
+	                device_memory& kgbuffer,
+	                device_memory& kernel_data);
+
+	/* Size in bytes of the split state buffer for num_threads elements. */
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+	/* Number of state elements that fit into max_buffer_size bytes. */
+	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
+
+	/* Enqueue the data-init kernel; path_trace() calls this once per batch
+	 * of samples, before any other kernel stage. Returns false on error.
+	 */
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs) = 0;
+
+	/* Look up the stage called kernel_name; load_kernels() treats a NULL
+	 * result as failure.
+	 */
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
+	/* Work sizes used by path_trace(); it rounds the global size up to a
+	 * multiple of the local size.
+	 */
+	virtual int2 split_kernel_local_size() = 0;
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+ + + diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 48d18035c13..ca303365627 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -17,12 +17,12 @@ #include <stdlib.h> #include <string.h> -#include "device_task.h" +#include "device/device_task.h" -#include "buffers.h" +#include "render/buffers.h" -#include "util_algorithm.h" -#include "util_time.h" +#include "util/util_algorithm.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 8bd54c3d2b0..feee89fd6e4 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -17,11 +17,11 @@ #ifndef __DEVICE_TASK_H__ #define __DEVICE_TASK_H__
-#include "device_memory.h" +#include "device/device_memory.h" -#include "util_function.h" -#include "util_list.h" -#include "util_task.h" +#include "util/util_function.h" +#include "util/util_list.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -51,6 +51,8 @@ public: int shader_filter; int shader_x, shader_w; + int passes_size; + explicit DeviceTask(Type type = PATH_TRACE); int get_subtask_count(int num, int max_size = 0); diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4023ba89a10..764216d0dfa 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -16,39 +16,39 @@ #ifdef WITH_OPENCL -#include "device.h" +#include "device/device.h" -#include "util_map.h" -#include "util_param.h" -#include "util_string.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" #include "clew.h" CCL_NAMESPACE_BEGIN -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -/* Macro declarations used with split kernel */ - -/* Macro to enable/disable work-stealing */ -#define __WORK_STEALING__ - -#define SPLIT_KERNEL_LOCAL_SIZE_X 64 -#define SPLIT_KERNEL_LOCAL_SIZE_Y 1 +/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ +#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. 
*/
+
+/* Helper for the workaround macros below: waits for the queue to finish and
+ * hands back the status of the enqueue call that was just issued, so callers
+ * such as `ciErr = clEnqueueNDRangeKernel(...)` still receive the enqueue's
+ * own error code.
+ */
+static inline cl_int opencl_finish_and_forward(cl_command_queue queue, cl_int enqueue_err)
+{
+	clFinish(queue);
+	return enqueue_err;
+}
+
+/* Each wrapper is one expression: finish, enqueue, finish again. The value
+ * of the expression is the result of the wrapped enqueue call, which keeps
+ * the wrappers usable on the right-hand side of an assignment, inside
+ * opencl_assert() and in unbraced if statements.
+ */
+# undef clEnqueueNDRangeKernel
+# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+	(clFinish(a), \
+	 opencl_finish_and_forward((a), CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i)))
+
+# undef clEnqueueWriteBuffer
+# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+	(clFinish(a), \
+	 opencl_finish_and_forward((a), CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i)))
+
+# undef clEnqueueReadBuffer
+# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+	(clFinish(a), \
+	 opencl_finish_and_forward((a), CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i)))
+#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -/* This value may be tuned according to the scene we are rendering. - * - * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected - * ray-bounces will improve performance. - */ -#define PATH_ITER_INC_FACTOR 8 - -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. - */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB +#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { OpenCLPlatformDevice(cl_platform_id platform_id, @@ -90,6 +90,54 @@ public: cl_device_id device_id); static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); + static bool use_single_program(); + + /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ + + /* Platform information.
*/ + static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); + static cl_uint get_num_platforms(); + + static bool get_platforms(vector<cl_platform_id> *platform_ids, + cl_int *error = NULL); + static vector<cl_platform_id> get_platforms(); + + static bool get_platform_name(cl_platform_id platform_id, + string *platform_name); + static string get_platform_name(cl_platform_id platform_id); + + static bool get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error = NULL); + static cl_uint get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + static bool get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int* error = NULL); + static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + /* Device information. */ + static bool get_device_name(cl_device_id device_id, + string *device_name, + cl_int* error = NULL); + + static string get_device_name(cl_device_id device_id); + + static bool get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int* error = NULL); + static cl_device_type get_device_type(cl_device_id device_id); + + /* Get somewhat more readable device name. + * Main difference is AMD OpenCL here which only gives code name + * for the regular device name. This will give more sane device + * name using some extensions. + */ + static string get_readable_device_name(cl_device_id device_id); }; /* Thread safe cache for contexts and programs. @@ -248,6 +296,7 @@ public: bool device_initialized; string platform_name; + string device_name; bool opencl_error(cl_int err); void opencl_error(const string& message); @@ -266,10 +315,10 @@ public: /* Has to be implemented by the real device classes. * The base device will then load all these programs. 
*/ - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLProgram*> &programs) = 0; - void mem_alloc(device_memory& mem, MemoryType type); + void mem_alloc(const char *name, device_memory& mem, MemoryType type); void mem_copy_to(device_memory& mem); void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); @@ -326,16 +375,39 @@ protected: class ArgumentWrapper { public: - ArgumentWrapper() : size(0), pointer(NULL) {} - template <typename T> + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) { } + pointer(&argument) + { + } + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), - pointer(&int_value) { } + pointer(&int_value) + { + } + ArgumentWrapper(float argument) : size(sizeof(float)), float_value(argument), - pointer(&float_value) { } + pointer(&float_value) + { + } + size_t size; int int_value; float float_value; diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index a2b900312e7..52d0662a8e3 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -16,15 +16,15 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_time.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" 
+#include "util/util_path.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cpPlatform = platform_device.platform_id; cdDevice = platform_device.device_id; platform_name = platform_device.platform_name; + device_name = platform_device.device_name; VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << platform_device.device_name << "."; + << device_name << "."; { /* try to use cached context */ @@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou } cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); return; + } null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); return; + } fprintf(stderr, "Device init success\n"); device_initialized = true; @@ -147,10 +152,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase() void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info, const void * /*private_info*/, size_t /*cb*/, void *user_data) { - char name[256]; - clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL); - - fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info); + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); } bool OpenCLDeviceBase::opencl_version_check() @@ -191,6 +194,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options) bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features) { + VLOG(2) << "Loading kernels for platform " << platform_name + << ", device " << device_name << "."; /* Verify 
if device was initialized. */ if(!device_initialized) { fprintf(stderr, "OpenCL: failed to initialize device.\n"); @@ -206,11 +211,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("convert_to_half_float")); base_program.add_kernel(ustring("shader")); base_program.add_kernel(ustring("bake")); + base_program.add_kernel(ustring("zero_buffer")); vector<OpenCLProgram*> programs; programs.push_back(&base_program); /* Call actual class to fill the vector with its programs. */ - load_kernels(requested_features, programs); + if(!load_kernels(requested_features, programs)) { + return false; + } /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks * serialize the calls internally, so it's not much use right now. @@ -242,8 +250,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea return true; } -void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type) +void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + size_t size = mem.memory_size(); cl_mem_flags mem_flag; @@ -311,8 +325,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in void OpenCLDeviceBase::mem_zero(device_memory& mem) { if(mem.device_pointer) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); - mem_copy_to(mem); + if(base_program.is_loaded()) { + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; + + while(d_offset < mem.memory_size()) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } + } + + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } + + if(!base_program.is_loaded()) { + void* zero = (void*)mem.data_pointer; + + if(!mem.data_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, NULL)); + + if(!mem.data_pointer) { + util_aligned_free(zero); + } + } } } @@ -337,7 +404,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) device_vector<uchar> *data = new device_vector<uchar>(); data->copy((uchar*)host, size); - mem_alloc(*data, MEM_READ_ONLY); + mem_alloc(name, *data, MEM_READ_ONLY); i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first; } else { @@ -356,7 +423,7 @@ void OpenCLDeviceBase::tex_alloc(const 
char *name, VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); assert(mem_map.find(name) == mem_map.end()); mem_map.insert(MemMap::value_type(name, mem.device_pointer)); @@ -460,7 +527,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); -#include "kernel_textures.h" +#include "kernel/kernel_textures.h" #undef KERNEL_TEX start_arg_index += kernel_set_args(ckFilmConvertKernel, @@ -511,7 +578,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(kernel, &start_arg_index, #name); -#include "kernel_textures.h" +#include "kernel/kernel_textures.h" #undef KERNEL_TEX start_arg_index += kernel_set_args(kernel, diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index 6ea7619e022..a2fd1d71156 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -16,15 +16,15 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "buffers.h" +#include "render/buffers.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_time.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -43,11 +43,12 @@ public: return true; } - virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/, + virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/, vector<OpenCLProgram*> &programs) { path_trace_program.add_kernel(ustring("path_trace")); programs.push_back(&path_trace_program); + return true; } 
~OpenCLDeviceMegaKernel() @@ -83,7 +84,7 @@ public: #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name); -#include "kernel_textures.h" +#include "kernel/kernel_textures.h" #undef KERNEL_TEX start_arg_index += kernel_set_args(ckPathTraceKernel, diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 3c3c2150128..579dbc84f53 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -16,1290 +16,359 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "buffers.h" +#include "render/buffers.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_time.h" +#include "device/device_split_kernel.h" + +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN -/* TODO(sergey): This is to keep tile split on OpenCL level working - * for now, since without this view-port render does not work as it - * should. - * - * Ideally it'll be done on the higher level, but we need to get ready - * for merge rather soon, so let's keep split logic private here in - * the file. 
- */ -class SplitRenderTile : public RenderTile { -public: - SplitRenderTile() - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) {} - - explicit SplitRenderTile(RenderTile& tile) - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) - { - x = tile.x; - y = tile.y; - w = tile.w; - h = tile.h; - start_sample = tile.start_sample; - num_samples = tile.num_samples; - sample = tile.sample; - resolution = tile.resolution; - offset = tile.offset; - stride = tile.stride; - buffer = tile.buffer; - rng_state = tile.rng_state; - buffers = tile.buffers; +class OpenCLSplitKernel; + +static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) +{ + string build_options = "-D__SPLIT_KERNEL__ "; + build_options += requested_features.get_build_options(); + + /* Set compute device build option. */ + cl_device_type device_type; + OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr); + assert(device->ciErr == CL_SUCCESS); + if(device_type == CL_DEVICE_TYPE_GPU) { + build_options += " -D__COMPUTE_DEVICE_GPU__"; } - /* Split kernel is device global memory constrained; - * hence split kernel cant render big tile size's in - * one go. If the user sets a big tile size (big tile size - * is a term relative to the available device global memory), - * we split the tile further and then call path_trace on - * each of those split tiles. The following variables declared, - * assist in achieving that purpose - */ - int buffer_offset_x; - int buffer_offset_y; - int rng_state_offset_x; - int rng_state_offset_y; - int buffer_rng_state_stride; -}; + return build_options; +} /* OpenCLDeviceSplitKernel's declaration/definition. */ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase { public: - /* Kernel declaration. 
*/ + DeviceSplitKernel *split_kernel; OpenCLProgram program_data_init; - OpenCLProgram program_scene_intersect; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_queue_enqueue; - OpenCLProgram program_background_buffer_update; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked; - OpenCLProgram program_next_iteration_setup; - OpenCLProgram program_sum_all_radiance; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - cl_mem rng_coop; - cl_mem throughput_coop; - cl_mem L_transparent_coop; - cl_mem PathRadiance_coop; - cl_mem Ray_coop; - cl_mem PathState_coop; - cl_mem Intersection_coop; - cl_mem kgbuffer; /* KernelGlobals buffer. */ - - /* Global buffers for ShaderData. */ - cl_mem sd; /* ShaderData used in the main path-iteration loop. */ - cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and - * shadow_blocked kernel. - */ - - /* Global memory required for shadow blocked and accum_radiance. */ - cl_mem BSDFEval_coop; - cl_mem ISLamp_coop; - cl_mem LightRay_coop; - cl_mem AOAlpha_coop; - cl_mem AOBSDF_coop; - cl_mem AOLightRay_coop; - cl_mem Intersection_coop_shadow; - -#ifdef WITH_CYCLES_DEBUG - /* DebugData memory */ - cl_mem debugdata_coop; -#endif - - /* Global state array that tracks ray state. */ - cl_mem ray_state; - - /* Per sample buffers. */ - cl_mem per_sample_output_buffers; - - /* Denotes which sample each ray is being processed for. */ - cl_mem work_array; - - /* Queue */ - cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */ - cl_mem Queue_index; /* Array of size num_queues * sizeof(int); - * Tracks the size of each queue. - */ - - /* Flag to make sceneintersect and lampemission kernel use queues. 
*/ - cl_mem use_queues_flag; - - /* Amount of memory in output buffer associated with one pixel/thread. */ - size_t per_thread_output_buffer_size; - - /* Total allocatable available device memory. */ - size_t total_allocatable_memory; - - /* host version of ray_state; Used in checking host path-iteration - * termination. - */ - char *hostRayStateArray; - - /* Number of path-iterations to be done in one shot. */ - unsigned int PathIteration_times; - -#ifdef __WORK_STEALING__ - /* Work pool with respect to each work group. */ - cl_mem work_pool_wgs; - - /* Denotes the maximum work groups possible w.r.t. current tile size. */ - unsigned int max_work_groups; -#endif - - /* clos_max value for which the kernels have been loaded currently. */ - int current_max_closure; - - /* Marked True in constructor and marked false at the end of path_trace(). */ - bool first_tile; - - OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) - : OpenCLDeviceBase(info, stats, background_) - { - background = background_; - - /* Initialize cl_mem variables. */ - kgbuffer = NULL; - sd = NULL; - sd_DL_shadow = NULL; - - rng_coop = NULL; - throughput_coop = NULL; - L_transparent_coop = NULL; - PathRadiance_coop = NULL; - Ray_coop = NULL; - PathState_coop = NULL; - Intersection_coop = NULL; - ray_state = NULL; - - AOAlpha_coop = NULL; - AOBSDF_coop = NULL; - AOLightRay_coop = NULL; - BSDFEval_coop = NULL; - ISLamp_coop = NULL; - LightRay_coop = NULL; - Intersection_coop_shadow = NULL; - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = NULL; -#endif - - work_array = NULL; - - /* Queue. 
*/ - Queue_data = NULL; - Queue_index = NULL; - use_queues_flag = NULL; - - per_sample_output_buffers = NULL; - - per_thread_output_buffer_size = 0; - hostRayStateArray = NULL; - PathIteration_times = PATH_ITER_INC_FACTOR; -#ifdef __WORK_STEALING__ - work_pool_wgs = NULL; - max_work_groups = 0; -#endif - current_max_closure = -1; - first_tile = true; - - /* Get device's maximum memory that can be allocated. */ - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_MEM_ALLOC_SIZE, - sizeof(size_t), - &total_allocatable_memory, - NULL); - assert(ciErr == CL_SUCCESS); - if(platform_name == "AMD Accelerated Parallel Processing") { - /* This value is tweak-able; AMD platform does not seem to - * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE - * is considered for further computation. - */ - total_allocatable_memory /= 2; - } - } + OpenCLProgram program_state_buffer_size; - virtual bool show_samples() const { - return false; - } + OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_); - /* Split kernel utility functions. 
*/ - size_t get_tex_size(const char *tex_name) + ~OpenCLDeviceSplitKernel() { - cl_mem ptr; - size_t ret_size = 0; - MemMap::iterator i = mem_map.find(tex_name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - ciErr = clGetMemObjectInfo(ptr, - CL_MEM_SIZE, - sizeof(ret_size), - &ret_size, - NULL); - assert(ciErr == CL_SUCCESS); - } - return ret_size; + task_pool.stop(); + + /* Release kernels */ + program_data_init.release(); + + delete split_kernel; } - size_t get_shader_data_size(size_t max_closure) + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, + vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { - /* ShaderData size with variable size ShaderClosure array */ - return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure)); + bool single_program = OpenCLInfo::use_single_program(); + program_data_init = OpenCLDeviceBase::OpenCLProgram(this, + single_program ? "split" : "split_data_init", + single_program ? "kernel_split.cl" : "kernel_data_init.cl", + get_build_options(this, requested_features)); + + program_data_init.add_kernel(ustring("path_trace_data_init")); + programs.push_back(&program_data_init); + + program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, + single_program ? "split" : "split_state_buffer_size", + single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl", + get_build_options(this, requested_features)); + program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); + programs.push_back(&program_state_buffer_size); + + return split_kernel->load_kernels(requested_features); } - /* Returns size of KernelGlobals structure associated with OpenCL. */ - size_t get_KernelGlobals_size() + void thread_run(DeviceTask *task) { - /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. 
- */ - typedef struct KernelGlobals { - ccl_constant KernelData *data; + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if(task->type == DeviceTask::SHADER) { + shader(*task); + } + else if(task->type == DeviceTask::PATH_TRACE) { + RenderTile tile; + + /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to + * fetch its size. + */ + typedef struct KernelGlobals { + ccl_constant KernelData *data; #define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; -#include "kernel_textures.h" + ccl_global type *name; +#include "kernel/kernel_textures.h" #undef KERNEL_TEX - void *sd_input; - void *isect_shadow; - } KernelGlobals; + SplitData split_data; + SplitParams split_param_data; + } KernelGlobals; + + /* Allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); + + /* Keep rendering tiles until done. */ + while(task->acquire_tile(this, tile)) { + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. 
+ */ + clFinish(cqCommandQueue); - return sizeof(KernelGlobals); + task->release_tile(tile); + } + + mem_free(kgbuffer); + } + } + +protected: + /* ** Those guys are for workign around some compiler-specific bugs ** */ + + string build_options_for_base_program( + const DeviceRequestedFeatures& requested_features) + { + return requested_features.get_build_options(); } - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, - vector<OpenCLProgram*> &programs) + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; +}; + +class OpenCLSplitKernelFunction : public SplitKernelFunction { +public: + OpenCLDeviceSplitKernel* device; + OpenCLDeviceBase::OpenCLProgram program; + + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} + ~OpenCLSplitKernelFunction() { program.release(); } + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - string build_options = "-D__SPLIT_KERNEL__ "; -#ifdef __WORK_STEALING__ - build_options += "-D__WORK_STEALING__ "; -#endif - build_options += requested_features.get_build_options(); - - /* Set compute device build option. 
*/ - cl_device_type device_type; - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); - assert(ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += " -D__COMPUTE_DEVICE_GPU__"; + device->kernel_set_args(program(), 0, kg, data); + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; } -#define GLUE(a, b) a ## b -#define LOAD_KERNEL(name) \ - do { \ - GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \ - GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \ - programs.push_back(&GLUE(program_, name)); \ - } while(false) - - LOAD_KERNEL(data_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(background_buffer_update); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(sum_all_radiance); - -#undef FIND_KERNEL -#undef GLUE - - current_max_closure = requested_features.max_closure; + return true; } +}; - ~OpenCLDeviceSplitKernel() +class OpenCLSplitKernel : public DeviceSplitKernel { + OpenCLDeviceSplitKernel *device; +public: + explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { + } + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + const DeviceRequestedFeatures& requested_features) { - task_pool.stop(); + OpenCLSplitKernelFunction* kernel = new 
OpenCLSplitKernelFunction(device); - /* Release kernels */ - program_data_init.release(); - program_scene_intersect.release(); - program_lamp_emission.release(); - program_queue_enqueue.release(); - program_background_buffer_update.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_direct_lighting.release(); - program_shadow_blocked.release(); - program_next_iteration_setup.release(); - program_sum_all_radiance.release(); - - /* Release global memory */ - release_mem_object_safe(rng_coop); - release_mem_object_safe(throughput_coop); - release_mem_object_safe(L_transparent_coop); - release_mem_object_safe(PathRadiance_coop); - release_mem_object_safe(Ray_coop); - release_mem_object_safe(PathState_coop); - release_mem_object_safe(Intersection_coop); - release_mem_object_safe(kgbuffer); - release_mem_object_safe(sd); - release_mem_object_safe(sd_DL_shadow); - release_mem_object_safe(ray_state); - release_mem_object_safe(AOAlpha_coop); - release_mem_object_safe(AOBSDF_coop); - release_mem_object_safe(AOLightRay_coop); - release_mem_object_safe(BSDFEval_coop); - release_mem_object_safe(ISLamp_coop); - release_mem_object_safe(LightRay_coop); - release_mem_object_safe(Intersection_coop_shadow); -#ifdef WITH_CYCLES_DEBUG - release_mem_object_safe(debugdata_coop); -#endif - release_mem_object_safe(use_queues_flag); - release_mem_object_safe(Queue_data); - release_mem_object_safe(Queue_index); - release_mem_object_safe(work_array); -#ifdef __WORK_STEALING__ - release_mem_object_safe(work_pool_wgs); -#endif - release_mem_object_safe(per_sample_output_buffers); - - if(hostRayStateArray != NULL) { - free(hostRayStateArray); + bool single_program = OpenCLInfo::use_single_program(); + kernel->program = + OpenCLDeviceBase::OpenCLProgram(device, + single_program ? "split" : "split_" + kernel_name, + single_program ? 
"kernel_split.cl" : "kernel_" + kernel_name + ".cl", + get_build_options(device, requested_features)); + + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if(!kernel->program.is_loaded()) { + delete kernel; + return NULL; } + + return kernel; } - void path_trace(DeviceTask *task, - SplitRenderTile& rtile, - int2 max_render_feasible_tile_size) + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - /* Make sure that set render feasible tile size is a multiple of local - * work size dimensions. - */ - assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0); - assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0); + device_vector<uint64_t> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + uint threads = num_threads; + device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_state_buffer_size(), + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t)); + device->mem_free(size_buffer); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; + } + + return *size_buffer.get_data(); + } - size_t 
global_size[2]; - size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X, - SPLIT_KERNEL_LOCAL_SIZE_Y}; + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs + ) + { + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; /* Set the range of samples to be processed for every ray in * path-regeneration logic. */ cl_int start_sample = rtile.start_sample; cl_int end_sample = rtile.start_sample + rtile.num_samples; - cl_int num_samples = rtile.num_samples; - -#ifdef __WORK_STEALING__ - global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_parallel_samples = 1; -#else - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - unsigned int num_tile_columns_possible = num_threads / global_size[1]; - /* Estimate number of parallel samples that can be - * processed in parallel. - */ - unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w, - rtile.num_samples); - /* Wavefront size in AMD is 64. - * TODO(sergey): What about other platforms? - */ - if(num_parallel_samples >= 64) { - /* TODO(sergey): Could use generic round-up here. */ - num_parallel_samples = (num_parallel_samples / 64) * 64; - } - assert(num_parallel_samples != 0); - - global_size[0] = d_w * num_parallel_samples; -#endif /* __WORK_STEALING__ */ - - assert(global_size[0] * global_size[1] <= - max_render_feasible_tile_size.x * max_render_feasible_tile_size.y); - - /* Allocate all required global memory once. 
*/ - if(first_tile) { - size_t num_global_elements = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - size_t shaderdata_size = get_shader_data_size(current_max_closure); - -#ifdef __WORK_STEALING__ - /* Calculate max groups */ - size_t max_global_size[2]; - size_t tile_x = max_render_feasible_tile_size.x; - size_t tile_y = max_render_feasible_tile_size.y; - max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; - max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; - max_work_groups = (max_global_size[0] * max_global_size[1]) / - (local_size[0] * local_size[1]); - /* Allocate work_pool_wgs memory. */ - work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int)); -#endif /* __WORK_STEALING__ */ - - /* Allocate queue_index memory only once. */ - Queue_index = mem_alloc(NUM_QUEUES * sizeof(int)); - use_queues_flag = mem_alloc(sizeof(char)); - kgbuffer = mem_alloc(get_KernelGlobals_size()); - - /* Create global buffers for ShaderData. */ - sd = mem_alloc(num_global_elements * shaderdata_size); - sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size); - - /* Creation of global memory buffers which are shared among - * the kernels. 
- */ - rng_coop = mem_alloc(num_global_elements * sizeof(RNG)); - throughput_coop = mem_alloc(num_global_elements * sizeof(float3)); - L_transparent_coop = mem_alloc(num_global_elements * sizeof(float)); - PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance)); - Ray_coop = mem_alloc(num_global_elements * sizeof(Ray)); - PathState_coop = mem_alloc(num_global_elements * sizeof(PathState)); - Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection)); - AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval)); - ISLamp_coop = mem_alloc(num_global_elements * sizeof(int)); - LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection)); - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData)); -#endif - - ray_state = mem_alloc(num_global_elements * sizeof(char)); - - hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char)); - assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory"); - - Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int))); - work_array = mem_alloc(num_global_elements * sizeof(unsigned int)); - per_sample_output_buffers = mem_alloc(num_global_elements * - per_thread_output_buffer_size); - } - - cl_int dQueue_size = global_size[0] * global_size[1]; cl_uint start_arg_index = - kernel_set_args(program_data_init(), + device->kernel_set_args(device->program_data_init(), 0, - kgbuffer, - sd_DL_shadow, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, - ray_state); + kernel_globals, + kernel_data, + 
split_data, + num_global_elements, + ray_state, + rtile.rng_state); /* TODO(sergey): Avoid map lookup here. */ #define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(program_data_init(), &start_arg_index, #name); -#include "kernel_textures.h" + device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name); +#include "kernel/kernel_textures.h" #undef KERNEL_TEX start_arg_index += - kernel_set_args(program_data_init(), + device->kernel_set_args(device->program_data_init(), start_arg_index, start_sample, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - Queue_data, - Queue_index, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, dQueue_size, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_scene_intersect(), - 0, - kgbuffer, - d_data, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_lamp_emission(), - 0, - kgbuffer, - d_data, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, - num_parallel_samples); - - kernel_set_args(program_queue_enqueue(), - 0, - Queue_data, - Queue_index, - ray_state, - dQueue_size); - - kernel_set_args(program_background_buffer_update(), - 0, - kgbuffer, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - d_w, - d_h, - d_x, - d_y, - d_stride, - rtile.rng_state_offset_x, - 
rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - work_array, - Queue_data, - Queue_index, - dQueue_size, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_shader_eval(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), - 0, - kgbuffer, - d_data, - sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - d_w, - d_h, - d_x, - d_y, - d_stride, - ray_state, - work_array, - Queue_data, - Queue_index, - dQueue_size, -#ifdef __WORK_STEALING__ - start_sample, -#endif - num_parallel_samples); - - kernel_set_args(program_direct_lighting(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_shadow_blocked(), - 0, - kgbuffer, - d_data, - PathState_coop, - LightRay_coop, - AOLightRay_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_next_iteration_setup(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_coop, - ISLamp_coop, - BSDFEval_coop, - AOLightRay_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag); - - kernel_set_args(program_sum_all_radiance(), - 0, - d_data, - d_buffer, - per_sample_output_buffers, - num_parallel_samples, - d_w, - d_h, - d_stride, - rtile.buffer_offset_x, - rtile.buffer_offset_y, - rtile.buffer_rng_state_stride, - start_sample); - - /* Macro for Enqueuing 
split kernels. */ -#define GLUE(a, b) a ## b -#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \ - { \ - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \ - GLUE(program_, \ - kernelName)(), \ - 2, \ - NULL, \ - globalSize, \ - localSize, \ - 0, \ - NULL, \ - NULL); \ - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \ - if(ciErr != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \ - clewErrorString(ciErr)); \ - opencl_error(message); \ - return; \ - } \ - } (void) 0 + rtile.num_samples, + rtile.buffer); /* Enqueue ckPathTraceKernel_data_init kernel. */ - ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size); - bool activeRaysAvailable = true; - - /* Record number of time host intervention has been made */ - unsigned int numHostIntervention = 0; - unsigned int numNextPathIterTimes = PathIteration_times; - bool canceled = false; - while(activeRaysAvailable) { - /* Twice the global work size of other kernels for - * ckPathTraceKernel_shadow_blocked_direct_lighting. */ - size_t global_size_shadow_blocked[2]; - global_size_shadow_blocked[0] = global_size[0] * 2; - global_size_shadow_blocked[1] = global_size[1]; - - /* Do path-iteration in host [Enqueue Path-iteration kernels. 
*/ - for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); - - if(task->get_cancel()) { - canceled = true; - break; - } - } - - /* Read ray-state into Host memory to decide if we should exit - * path-iteration in host. - */ - ciErr = clEnqueueReadBuffer(cqCommandQueue, - ray_state, - CL_TRUE, - 0, - global_size[0] * global_size[1] * sizeof(char), - hostRayStateArray, - 0, - NULL, - NULL); - assert(ciErr == CL_SUCCESS); - - activeRaysAvailable = false; - - for(int rayStateIter = 0; - rayStateIter < global_size[0] * global_size[1]; - ++rayStateIter) - { - if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) { - /* Not all rays are RAY_INACTIVE. */ - activeRaysAvailable = true; - break; - } - } - - if(activeRaysAvailable) { - numHostIntervention++; - PathIteration_times = PATH_ITER_INC_FACTOR; - /* Host intervention done before all rays become RAY_INACTIVE; - * Set do more initial iterations for the next tile. - */ - numNextPathIterTimes += PATH_ITER_INC_FACTOR; - } - - if(task->get_cancel()) { - canceled = true; - break; - } - } - - /* Execute SumALLRadiance kernel to accumulate radiance calculated in - * per_sample_output_buffers into RenderTile's output buffer. 
- */ - if(!canceled) { - size_t sum_all_radiance_local_size[2] = {16, 16}; - size_t sum_all_radiance_global_size[2]; - sum_all_radiance_global_size[0] = - (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) * - sum_all_radiance_local_size[0]; - sum_all_radiance_global_size[1] = - (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) * - sum_all_radiance_local_size[1]; - ENQUEUE_SPLIT_KERNEL(sum_all_radiance, - sum_all_radiance_global_size, - sum_all_radiance_local_size); - } - -#undef ENQUEUE_SPLIT_KERNEL -#undef GLUE - - if(numHostIntervention == 0) { - /* This means that we are executing kernel more than required - * Must avoid this for the next sample/tile. - */ - PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ? - PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR; - } - else { - /* Number of path-iterations done for this tile is set as - * Initial path-iteration times for the next tile - */ - PathIteration_times = numNextPathIterTimes; - } - - first_tile = false; - } - - /* Calculates the amount of memory that has to be always - * allocated in order for the split kernel to function. - * This memory is tile/scene-property invariant (meaning, - * the value returned by this function does not depend - * on the user set tile size or scene properties. - */ - size_t get_invariable_mem_allocated() - { - size_t total_invariable_mem_allocated = 0; - size_t KernelGlobals_size = 0; - - KernelGlobals_size = get_KernelGlobals_size(); - - total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */ - total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */ - total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */ - - return total_invariable_mem_allocated; - } - - /* Calculate the memory that has-to-be/has-been allocated for - * the split kernel to function. 
- */ - size_t get_tile_specific_mem_allocated(const int2 tile_size) - { - size_t tile_specific_mem_allocated = 0; - - /* Get required tile info */ - unsigned int user_set_tile_w = tile_size.x; - unsigned int user_set_tile_h = tile_size.y; - -#ifdef __WORK_STEALING__ - /* Calculate memory to be allocated for work_pools in - * case of work_stealing. - */ - size_t max_global_size[2]; - size_t max_num_work_pools = 0; - max_global_size[0] = - (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_global_size[1] = - (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - max_num_work_pools = - (max_global_size[0] * max_global_size[1]) / - (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y); - tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int); -#endif - - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size; - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * sizeof(RNG); - - return tile_specific_mem_allocated; - } - - /* Calculates the texture memories and KernelData (d_data) memory - * that has been allocated. - */ - size_t get_scene_specific_mem_allocated(cl_mem d_data) - { - size_t scene_specific_mem_allocated = 0; - /* Calculate texture memories. */ -#define KERNEL_TEX(type, ttype, name) \ - scene_specific_mem_allocated += get_tex_size(#name); -#include "kernel_textures.h" -#undef KERNEL_TEX - size_t d_data_size; - ciErr = clGetMemObjectInfo(d_data, - CL_MEM_SIZE, - sizeof(d_data_size), - &d_data_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info"); - scene_specific_mem_allocated += d_data_size; - return scene_specific_mem_allocated; - } - - /* Calculate the memory required for one thread in split kernel. 
*/ - size_t get_per_thread_memory() - { - size_t shaderdata_size = 0; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - shaderdata_size = get_shader_data_size(current_max_closure); - size_t retval = sizeof(RNG) - + sizeof(float3) /* Throughput size */ - + sizeof(float) /* L transparent size */ - + sizeof(char) /* Ray state size */ - + sizeof(unsigned int) /* Work element size */ - + sizeof(int) /* ISLamp_size */ - + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState) - + sizeof(Intersection) /* Overall isect */ - + sizeof(Intersection) /* Instersection_coop_AO */ - + sizeof(Intersection) /* Intersection coop DL */ - + shaderdata_size /* Overall ShaderData */ - + (shaderdata_size * 2) /* ShaderData : DL and shadow */ - + sizeof(Ray) + sizeof(BsdfEval) - + sizeof(float3) /* AOAlpha size */ - + sizeof(float3) /* AOBSDF size */ - + sizeof(Ray) - + (sizeof(int) * NUM_QUEUES) - + per_thread_output_buffer_size; - return retval; - } - - /* Considers the total memory available in the device and - * and returns the maximum global work size possible. - */ - size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data) - { - /* Calculate invariably allocated memory. */ - size_t invariable_mem_allocated = get_invariable_mem_allocated(); - /* Calculate tile specific allocated memory. */ - size_t tile_specific_mem_allocated = - get_tile_specific_mem_allocated(tile_size); - /* Calculate scene specific allocated memory. */ - size_t scene_specific_mem_allocated = - get_scene_specific_mem_allocated(d_data); - /* Calculate total memory available for the threads in global work size. 
*/ - size_t available_memory = total_allocatable_memory - - invariable_mem_allocated - - tile_specific_mem_allocated - - scene_specific_mem_allocated - - DATA_ALLOCATION_MEM_FACTOR; - size_t per_thread_memory_required = get_per_thread_memory(); - return (available_memory / per_thread_memory_required); - } - - /* Checks if the device has enough memory to render the whole tile; - * If not, we should split single tile into multiple tiles of small size - * and process them all. - */ - bool need_to_split_tile(unsigned int d_w, - unsigned int d_h, - int2 max_render_feasible_tile_size) - { - size_t global_size_estimate[2]; - /* TODO(sergey): Such round-ups are in quite few places, need to replace - * them with an utility macro. - */ - global_size_estimate[0] = - (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - global_size_estimate[1] = - (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if((global_size_estimate[0] * global_size_estimate[1]) > - (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y)) - { - return true; - } - else { + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_data_init(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); return false; } - } - /* Considers the scene properties, global memory available in the device - * and returns a rectanglular tile dimension (approx the maximum) - * that should render on split kernel. 
- */ - int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size) - { - int2 max_render_feasible_tile_size; - int square_root_val = (int)sqrt(feasible_global_work_size); - max_render_feasible_tile_size.x = square_root_val; - max_render_feasible_tile_size.y = square_root_val; - /* Ciel round-off max_render_feasible_tile_size. */ - int2 ceil_render_feasible_tile_size; - ceil_render_feasible_tile_size.x = - (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - ceil_render_feasible_tile_size.y = - (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <= - feasible_global_work_size) - { - return ceil_render_feasible_tile_size; - } - /* Floor round-off max_render_feasible_tile_size. */ - int2 floor_render_feasible_tile_size; - floor_render_feasible_tile_size.x = - (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) * - SPLIT_KERNEL_LOCAL_SIZE_X; - floor_render_feasible_tile_size.y = - (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - return floor_render_feasible_tile_size; + return true; } - /* Try splitting the current tile into multiple smaller - * almost-square-tiles. - */ - int2 get_split_tile_size(RenderTile rtile, - int2 max_render_feasible_tile_size) + virtual int2 split_kernel_local_size() { - int2 split_tile_size; - int num_global_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - int d_w = rtile.w; - int d_h = rtile.h; - /* Ceil round off d_w and d_h */ - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - while(d_w * d_h > num_global_threads) { - /* Halve the longer dimension. 
*/ - if(d_w >= d_h) { - d_w = d_w / 2; - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - } - else { - d_h = d_h / 2; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - } - } - split_tile_size.x = d_w; - split_tile_size.y = d_h; - return split_tile_size; + return make_int2(64, 1); } - /* Splits existing tile into multiple tiles of tile size split_tile_size. */ - vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size) + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/) { - vector<SplitRenderTile> to_path_trace_rtile; - int d_w = rtile.w; - int d_h = rtile.h; - int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1); - int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1); - /* Buffer and rng_state offset calc. */ - size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride); - size_t offset_x = offset_index % rtile.stride; - size_t offset_y = offset_index / rtile.stride; - /* Resize to_path_trace_rtile. 
*/ - to_path_trace_rtile.resize(num_tiles_x * num_tiles_y); - for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) { - for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) { - int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x; - to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample; - to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples; - to_path_trace_rtile[rtile_index].sample = rtile.sample; - to_path_trace_rtile[rtile_index].resolution = rtile.resolution; - to_path_trace_rtile[rtile_index].offset = rtile.offset; - to_path_trace_rtile[rtile_index].buffers = rtile.buffers; - to_path_trace_rtile[rtile_index].buffer = rtile.buffer; - to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state; - to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x); - to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y); - to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride; - /* Fill width and height of the new render tile. */ - to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ? - (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */ - : split_tile_size.x; - to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ? 
- (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */ - : split_tile_size.y; - to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w; - } + cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); + /* Use small global size on CPU devices as it seems to be much faster. */ + if(type == CL_DEVICE_TYPE_CPU) { + VLOG(1) << "Global size: (64, 64)."; + return make_int2(64, 64); } - return to_path_trace_rtile; - } - void thread_run(DeviceTask *task) - { - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; - bool initialize_data_and_check_render_feasibility = false; - bool need_to_split_tiles_further = false; - int2 max_render_feasible_tile_size; - size_t feasible_global_work_size; - const int2 tile_size = task->requested_tile_size; - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(!initialize_data_and_check_render_feasibility) { - /* Initialize data. */ - /* Calculate per_thread_output_buffer_size. */ - size_t output_buffer_size = 0; - ciErr = clGetMemObjectInfo((cl_mem)tile.buffer, - CL_MEM_SIZE, - sizeof(output_buffer_size), - &output_buffer_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info"); - /* This value is different when running on AMD and NV. */ - if(background) { - /* In offline render the number of buffer elements - * associated with tile.buffer is the current tile size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.w * tile.h); - } - else { - /* interactive rendering, unlike offline render, the number of buffer elements - * associated with tile.buffer is the entire viewport size. 
- */ - per_thread_output_buffer_size = - output_buffer_size / (tile.buffers->params.width * - tile.buffers->params.height); - } - /* Check render feasibility. */ - feasible_global_work_size = get_feasible_global_work_size( - tile_size, - CL_MEM_PTR(const_mem_map["__data"]->device_pointer)); - max_render_feasible_tile_size = - get_max_render_feasible_tile_size( - feasible_global_work_size); - need_to_split_tiles_further = - need_to_split_tile(tile_size.x, - tile_size.y, - max_render_feasible_tile_size); - initialize_data_and_check_render_feasibility = true; - } - if(need_to_split_tiles_further) { - int2 split_tile_size = - get_split_tile_size(tile, - max_render_feasible_tile_size); - vector<SplitRenderTile> to_path_trace_render_tiles = - split_tiles(tile, split_tile_size); - /* Print message to console */ - if(background && (to_path_trace_render_tiles.size() > 1)) { - fprintf(stderr, "Message : Tiles need to be split " - "further inside path trace (due to insufficient " - "device-global-memory for split kernel to " - "function) \n" - "The current tile of dimensions %dx%d is split " - "into tiles of dimension %dx%d for render \n", - tile.w, tile.h, - split_tile_size.x, - split_tile_size.y); - } - /* Process all split tiles. */ - for(int tile_iter = 0; - tile_iter < to_path_trace_render_tiles.size(); - ++tile_iter) - { - path_trace(task, - to_path_trace_render_tiles[tile_iter], - max_render_feasible_tile_size); - } - } - else { - /* No splitting required; process the entire tile at once. */ - /* Render feasible tile size is user-set-tile-size itself. */ - max_render_feasible_tile_size.x = - (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_render_feasible_tile_size.y = - (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - /* buffer_rng_state_stride is stride itself. 
*/ - SplitRenderTile split_tile(tile); - split_tile.buffer_rng_state_stride = tile.stride; - path_trace(task, split_tile, max_render_feasible_tile_size); - } - tile.sample = tile.start_sample + tile.num_samples; + cl_ulong max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + VLOG(1) << "Maximum device allocation size: " + << string_human_readable_number(max_buffer_size) << " bytes. (" + << string_human_readable_size(max_buffer_size) << ")."; - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - - task->release_tile(tile); - } - } - } - -protected: - cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE) - { - cl_mem ptr; - assert(bufsize != 0); - ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - return ptr; + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); + int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } +}; - /* ** Those guys are for workign around some compiler-specific bugs ** */ +OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) +: OpenCLDeviceBase(info, stats, background_) +{ + split_kernel = new OpenCLSplitKernel(this); - string build_options_for_base_program( - const DeviceRequestedFeatures& requested_features) - { - return requested_features.get_build_options(); - } -}; + background = 
background_; +} Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background) { diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 82e1640e508..8128fcee09b 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -16,11 +16,12 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "util_logging.h" -#include "util_path.h" -#include "util_time.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" using std::cerr; using std::endl; @@ -234,7 +235,7 @@ string OpenCLCache::get_kernel_md5() thread_scoped_lock lock(self.kernel_md5_lock); if(self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("kernel")); + self.kernel_md5 = path_files_md5_hash(path_get("source")); } return self.kernel_md5; } @@ -309,6 +310,8 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) string build_options; build_options = device->kernel_build_options(debug_src) + kernel_build_options; + VLOG(1) << "Build options passed to clBuildProgram: '" + << build_options << "'."; cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); /* show warnings even if build is successful */ @@ -336,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n"; + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; /* We compile kernels consisting of many files. unfortunately OpenCL * kernel caches do not seem to recognize changes in included files. * so we force recompile on changes by adding the md5 hash of all files. 
*/ - source = path_source_replace_includes(source, path_get("kernel")); + source = path_source_replace_includes(source, path_get("source")); + source += "\n// " + util_md5_string(source) + "\n"; if(debug_src) { path_write_text(*debug_src, source); @@ -352,10 +356,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) cl_int ciErr; program = clCreateProgramWithSource(device->cxContext, - 1, - &source_str, - &source_len, - &ciErr); + 1, + &source_str, + &source_len, + &ciErr); if(ciErr != CL_SUCCESS) { add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); @@ -438,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load() if(!program) { add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5(); + /* need to create source to get md5 */ + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; + source = path_source_replace_includes(source, path_get("source")); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); basename = path_cache_get(path_join("kernels", basename)); string clbin = basename + ".clbin"; @@ -544,6 +552,11 @@ bool OpenCLInfo::use_debug() return DebugFlags().opencl.debug; } +bool OpenCLInfo::use_single_program() +{ + return DebugFlags().opencl.single_program; +} + bool OpenCLInfo::kernel_use_advanced_shading(const string& platform) { /* keep this in sync with kernel_types.h! 
*/ @@ -587,11 +600,20 @@ bool OpenCLInfo::device_supported(const string& platform_name, const cl_device_id device_id) { cl_device_type device_type; - clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); + if(!get_device_type(device_id, &device_type)) { + return false; + } + string device_name; + if(!get_device_name(device_id, &device_name)) { + return false; + } + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework + * (aka, it will not be on Intel framework). This isn't supported + * and needs an explicit blacklist. + */ + if(strstr(device_name.c_str(), "Iris")) { + return false; + } if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { @@ -705,39 +727,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices return; } + cl_int error; vector<cl_device_id> device_ids; - cl_uint num_devices = 0; vector<cl_platform_id> platform_ids; - cl_uint num_platforms = 0; - /* Get devices. */ - if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || - num_platforms == 0) - { - FIRST_VLOG(2) << "No OpenCL platforms were found."; + /* Get platforms. */ + if(!get_platforms(&platform_ids, &error)) { + FIRST_VLOG(2) << "Error fetching platforms:" + << string(clewErrorString(error)); first_time = false; return; } - platform_ids.resize(num_platforms); - if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) { - FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver.."; + if(platform_ids.size() == 0) { + FIRST_VLOG(2) << "No OpenCL platforms were found."; first_time = false; return; } /* Devices are numbered consecutively across platforms. 
*/ - for(int platform = 0; platform < num_platforms; platform++) { + for(int platform = 0; platform < platform_ids.size(); platform++) { cl_platform_id platform_id = platform_ids[platform]; - char pname[256]; - if(clGetPlatformInfo(platform_id, - CL_PLATFORM_NAME, - sizeof(pname), - &pname, - NULL) != CL_SUCCESS) - { + string platform_name; + if(!get_platform_name(platform_id, &platform_name)) { FIRST_VLOG(2) << "Failed to get platform name, ignoring."; continue; } - string platform_name = pname; FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; if(!platform_version_check(platform_id)) { @@ -745,39 +758,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices << " due to too old compiler version."; continue; } - num_devices = 0; - cl_int ciErr; - if((ciErr = clGetDeviceIDs(platform_id, - device_type, - 0, - NULL, - &num_devices)) != CL_SUCCESS || num_devices == 0) + if(!get_platform_devices(platform_id, + device_type, + &device_ids, + &error)) { FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch number of devices: " << string(clewErrorString(ciErr)); + << ", failed to fetch of devices: " + << string(clewErrorString(error)); continue; } - device_ids.resize(num_devices); - if(clGetDeviceIDs(platform_id, - device_type, - num_devices, - &device_ids[0], - NULL) != CL_SUCCESS) - { + if(device_ids.size() == 0) { FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch devices list."; + << ", it has no devices."; continue; } - for(int num = 0; num < num_devices; num++) { - cl_device_id device_id = device_ids[num]; - char device_name[1024] = "\0"; - if(clGetDeviceInfo(device_id, - CL_DEVICE_NAME, - sizeof(device_name), - &device_name, - NULL) != CL_SUCCESS) - { - FIRST_VLOG(2) << "Failed to fetch device name, ignoring."; + for(int num = 0; num < device_ids.size(); num++) { + const cl_device_id device_id = device_ids[num]; + string device_name; + 
if(!get_device_name(device_id, &device_name, &error)) { + FIRST_VLOG(2) << "Failed to fetch device name: " + << string(clewErrorString(error)) + << ", ignoring."; continue; } if(!device_version_check(device_id)) { @@ -789,24 +791,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices device_supported(platform_name, device_id)) { cl_device_type device_type; - if(clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL) != CL_SUCCESS) - { + if(!get_device_type(device_id, &device_type, &error)) { FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type."; + << ", failed to fetch device type:" + << string(clewErrorString(error)); continue; } - FIRST_VLOG(2) << "Adding new device " << device_name << "."; + string readable_device_name = + get_readable_device_name(device_id); + if(readable_device_name != device_name) { + FIRST_VLOG(2) << "Using more readable device name: " + << readable_device_name; + } + FIRST_VLOG(2) << "Adding new device " + << readable_device_name << "."; string hardware_id = get_hardware_id(platform_name, device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - device_name, - hardware_id)); + usable_devices->push_back(OpenCLPlatformDevice( + platform_id, + platform_name, + device_id, + device_type, + readable_device_name, + hardware_id)); } else { FIRST_VLOG(2) << "Ignoring device " << device_name @@ -817,6 +823,252 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices first_time = false; } +bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, + cl_int *error) +{ + /* Reset from possible previous state. */ + platform_ids->resize(0); + cl_uint num_platforms; + if(!get_num_platforms(&num_platforms, error)) { + return false; + } + /* Get actual platforms. 
*/ + cl_int err; + platform_ids->resize(num_platforms); + if((err = clGetPlatformIDs(num_platforms, + &platform_ids->at(0), + NULL)) != CL_SUCCESS) { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +vector<cl_platform_id> OpenCLInfo::get_platforms() +{ + vector<cl_platform_id> platform_ids; + get_platforms(&platform_ids); + return platform_ids; +} + +bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) +{ + cl_int err; + if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { + if(error != NULL) { + *error = err; + } + *num_platforms = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_uint OpenCLInfo::get_num_platforms() +{ + cl_uint num_platforms; + if(!get_num_platforms(&num_platforms)) { + return 0; + } + return num_platforms; +} + +bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, + string *platform_name) +{ + char buffer[256]; + if(clGetPlatformInfo(platform_id, + CL_PLATFORM_NAME, + sizeof(buffer), + &buffer, + NULL) != CL_SUCCESS) + { + *platform_name = ""; + return false; + } + *platform_name = buffer; + return true; +} + +string OpenCLInfo::get_platform_name(cl_platform_id platform_id) +{ + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + return ""; + } + return platform_name; +} + +bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error) +{ + cl_int err; + if((err = clGetDeviceIDs(platform_id, + device_type, + 0, + NULL, + num_devices)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *num_devices = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type) +{ + cl_uint num_devices; + 
if(!get_num_platform_devices(platform_id, + device_type, + &num_devices)) + { + return 0; + } + return num_devices; +} + +bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int* error) +{ + /* Reset from possible previous state. */ + device_ids->resize(0); + /* Get number of devices to pre-allocate memory. */ + cl_uint num_devices; + if(!get_num_platform_devices(platform_id, + device_type, + &num_devices, + error)) + { + return false; + } + /* Get actual device list. */ + device_ids->resize(num_devices); + cl_int err; + if((err = clGetDeviceIDs(platform_id, + device_type, + num_devices, + &device_ids->at(0), + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type) +{ + vector<cl_device_id> devices; + get_platform_devices(platform_id, device_type, &devices); + return devices; +} + +bool OpenCLInfo::get_device_name(cl_device_id device_id, + string *device_name, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DEVICE_NAME, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *device_name = ""; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + *device_name = buffer; + return true; +} + +string OpenCLInfo::get_device_name(cl_device_id device_id) +{ + string device_name; + if(!get_device_name(device_id, &device_name)) { + return ""; + } + return device_name; +} + +bool OpenCLInfo::get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int* error) +{ + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DEVICE_TYPE, + sizeof(cl_device_type), + device_type, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + 
*device_type = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) +{ + cl_device_type device_type; + if(!get_device_type(device_id, &device_type)) { + return 0; + } + return device_type; +} + +string OpenCLInfo::get_readable_device_name(cl_device_id device_id) +{ + char board_name[1024]; + if(clGetDeviceInfo(device_id, + CL_DEVICE_BOARD_NAME_AMD, + sizeof(board_name), + &board_name, + NULL) == CL_SUCCESS) + { + return board_name; + } + /* Fallback to standard device name API. */ + return get_device_name(device_id); +} + CCL_NAMESPACE_END #endif diff --git a/intern/cycles/graph/CMakeLists.txt b/intern/cycles/graph/CMakeLists.txt index 4ea18728f1c..e70a18137bd 100644 --- a/intern/cycles/graph/CMakeLists.txt +++ b/intern/cycles/graph/CMakeLists.txt @@ -1,7 +1,6 @@ set(INC - . - ../util + .. ) set(SRC diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp index 3c228a716d5..10d91a1e4ef 100644 --- a/intern/cycles/graph/node.cpp +++ b/intern/cycles/graph/node.cpp @@ -14,12 +14,12 @@ * limitations under the License. 
*/ -#include "node.h" -#include "node_type.h" +#include "graph/node.h" +#include "graph/node_type.h" -#include "util_foreach.h" -#include "util_param.h" -#include "util_transform.h" +#include "util/util_foreach.h" +#include "util/util_param.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h index 64410f4539b..53425f5faf1 100644 --- a/intern/cycles/graph/node.h +++ b/intern/cycles/graph/node.h @@ -16,11 +16,11 @@ #pragma once -#include "node_type.h" +#include "graph/node_type.h" -#include "util_map.h" -#include "util_param.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h index 2bae531c036..4e40c294f4f 100644 --- a/intern/cycles/graph/node_enum.h +++ b/intern/cycles/graph/node_enum.h @@ -16,8 +16,8 @@ #pragma once -#include "util_map.h" -#include "util_param.h" +#include "util/util_map.h" +#include "util/util_param.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp index 5b98de778ad..a3a8fa5f382 100644 --- a/intern/cycles/graph/node_type.cpp +++ b/intern/cycles/graph/node_type.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "node_type.h" -#include "util_foreach.h" -#include "util_transform.h" +#include "graph/node_type.h" +#include "util/util_foreach.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h index 1fb135f6d22..7d46e31ce24 100644 --- a/intern/cycles/graph/node_type.h +++ b/intern/cycles/graph/node_type.h @@ -16,12 +16,12 @@ #pragma once -#include "node_enum.h" +#include "graph/node_enum.h" -#include "util_map.h" -#include "util_param.h" -#include "util_string.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp index 590e09645ed..aad2740ffc0 100644 --- a/intern/cycles/graph/node_xml.cpp +++ b/intern/cycles/graph/node_xml.cpp @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "node_xml.h" +#include "graph/node_xml.h" -#include "util_foreach.h" -#include "util_string.h" -#include "util_transform.h" +#include "util/util_foreach.h" +#include "util/util_string.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_xml.h b/intern/cycles/graph/node_xml.h index 7494c5e6e55..63e80bf79f2 100644 --- a/intern/cycles/graph/node_xml.h +++ b/intern/cycles/graph/node_xml.h @@ -16,11 +16,11 @@ #pragma once -#include "node.h" +#include "graph/node.h" -#include "util_map.h" -#include "util_string.h" -#include "util_xml.h" +#include "util/util_map.h" +#include "util/util_string.h" +#include "util/util_xml.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 5f3ceb0f864..c3772dfa2d8 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -1,10 +1,7 @@ remove_extra_strict_flags() set(INC - . - ../util - osl - svm + .. 
) set(INC_SYS @@ -13,19 +10,28 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_split.cpp kernels/opencl/kernel.cl + kernels/opencl/kernel_state_buffer_size.cl + kernels/opencl/kernel_split.cl kernels/opencl/kernel_data_init.cl + kernels/opencl/kernel_path_init.cl kernels/opencl/kernel_queue_enqueue.cl kernels/opencl/kernel_scene_intersect.cl kernels/opencl/kernel_lamp_emission.cl - kernels/opencl/kernel_background_buffer_update.cl + kernels/opencl/kernel_do_volume.cl + kernels/opencl/kernel_indirect_background.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl + kernels/opencl/kernel_subsurface_scatter.cl kernels/opencl/kernel_direct_lighting.cl - kernels/opencl/kernel_shadow_blocked.cl + kernels/opencl/kernel_shadow_blocked_ao.cl + kernels/opencl/kernel_shadow_blocked_dl.cl kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_sum_all_radiance.cl + kernels/opencl/kernel_indirect_subsurface.cl + kernels/opencl/kernel_buffer_update.cl kernels/cuda/kernel.cu + kernels/cuda/kernel_split.cu ) set(SRC_BVH_HEADERS @@ -68,6 +74,7 @@ set(SRC_HEADERS kernel_path_common.h kernel_path_state.h kernel_path_surface.h + kernel_path_subsurface.h kernel_path_volume.h kernel_projection.h kernel_queues.h @@ -88,6 +95,10 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu_image.h ) +set(SRC_KERNELS_CUDA_HEADERS + kernels/cuda/kernel_config.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -184,6 +195,7 @@ set(SRC_UTIL_HEADERS ../util/util_hash.h ../util/util_math.h ../util/util_math_fast.h + ../util/util_math_intersect.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h @@ -191,17 +203,25 @@ set(SRC_UTIL_HEADERS ) set(SRC_SPLIT_HEADERS - split/kernel_background_buffer_update.h + split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h + split/kernel_do_volume.h split/kernel_holdout_emission_blurring_pathtermination_ao.h + 
split/kernel_indirect_background.h + split/kernel_indirect_subsurface.h split/kernel_lamp_emission.h split/kernel_next_iteration_setup.h + split/kernel_path_init.h + split/kernel_queue_enqueue.h split/kernel_scene_intersect.h split/kernel_shader_eval.h - split/kernel_shadow_blocked.h + split/kernel_shadow_blocked_ao.h + split/kernel_shadow_blocked_dl.h split/kernel_split_common.h - split/kernel_sum_all_radiance.h + split/kernel_split_data.h + split/kernel_split_data_types.h + split/kernel_subsurface_scatter.h ) # CUDA module @@ -229,8 +249,9 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu + set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -239,15 +260,22 @@ if(WITH_CYCLES_CUDA_BINARIES) ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch experimental) - if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") - set(cuda_cubin kernel_experimental_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) + if(${split}) + set(cuda_extra_flags "-D__SPLIT__") + set(cuda_cubin kernel_split) else() set(cuda_extra_flags "") - set(cuda_cubin kernel_${arch}.cubin) + set(cuda_cubin kernel) endif() + if(${experimental}) + set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) + set(cuda_cubin ${cuda_cubin}_experimental) + endif() + + set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") else() @@ -260,13 +288,19 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") + if(split) + set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") + else() + set(cuda_kernel_src "/kernels/cuda/kernel.cu") + endif() + add_custom_command( OUTPUT ${cuda_cubin} COMMAND ${cuda_nvcc_command} -arch=${arch} ${CUDA_NVCC_FLAGS} 
-m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu + --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} @@ -274,8 +308,7 @@ if(WITH_CYCLES_CUDA_BINARIES) ${cuda_math_flags} ${cuda_extra_flags} ${cuda_debug_flags} - -I${CMAKE_CURRENT_SOURCE_DIR}/../util - -I${CMAKE_CURRENT_SOURCE_DIR}/svm + -I${CMAKE_CURRENT_SOURCE_DIR}/.. -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC @@ -293,7 +326,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + + if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) + # Compile split kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + endif() endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -311,36 +349,50 @@ endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) +set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") + if(CXX_HAS_SSE) list(APPEND SRC kernels/cpu/kernel_sse2.cpp kernels/cpu/kernel_sse3.cpp kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp ) set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS 
"${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) list(APPEND SRC kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_split_avx.cpp ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) list(APPEND SRC kernels/cpu/kernel_avx2.cpp + kernels/cpu/kernel_split_avx2.cpp ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} @@ -362,24 +414,33 @@ endif() #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED}) #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_split.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 36798982653..85741016b25 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -27,43 +27,43 @@ CCL_NAMESPACE_BEGIN -#include "bvh_types.h" +#include "kernel/bvh/bvh_types.h" /* Common QBVH functions. 
*/ #ifdef __QBVH__ -# include "qbvh_nodes.h" +# include "kernel/bvh/qbvh_nodes.h" #endif /* Regular BVH traversal */ -#include "bvh_nodes.h" +#include "kernel/bvh/bvh_nodes.h" #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 -#include "bvh_traversal.h" +#include "kernel/bvh/bvh_traversal.h" #if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif /* Subsurface scattering BVH traversal */ @@ -71,12 +71,12 @@ CCL_NAMESPACE_BEGIN #if defined(__SUBSURFACE__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_subsurface.h" +# include "kernel/bvh/bvh_subsurface.h" # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion # define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR -# include "bvh_subsurface.h" +# include "kernel/bvh/bvh_subsurface.h" # endif #endif /* __SUBSURFACE__ */ @@ -85,18 +85,18 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # if defined(__INSTANCING__) # define 
BVH_FUNCTION_NAME bvh_intersect_volume_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # endif #endif /* __VOLUME__ */ @@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN #if defined(__SHADOW_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all # define BVH_FUNCTION_FEATURES 0 -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif #endif /* __SHADOW_RECORD_ALL__ */ @@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR 
-# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # endif #endif /* __VOLUME_RECORD_ALL__ */ @@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, } #ifdef __SUBSURFACE__ +/* Note: ray is passed by value to work around a possible CUDA compiler bug. */ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, - const Ray *ray, + const Ray ray, SubsurfaceIntersection *ss_isect, int subsurface_object, uint *lcg_state, @@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { return bvh_intersect_subsurface_motion(kg, - ray, + &ray, ss_isect, subsurface_object, lcg_state, @@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, } #endif /* __OBJECT_MOTION__ */ return bvh_intersect_subsurface(kg, - ray, + &ray, ss_isect, subsurface_object, lcg_state, @@ -229,30 +230,63 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, #endif #ifdef __SHADOW_RECORD_ALL__ -ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + int skip_object, + uint max_hits, + uint *num_hits) { # ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair_motion(kg, + ray, + isect, + skip_object, + max_hits, + num_hits); + } # endif /* __HAIR__ */ - return 
bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_shadow_all_motion(kg, + ray, + isect, + skip_object, + max_hits, + num_hits); } # endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair(kg, + ray, + isect, + skip_object, + max_hits, + num_hits); + } # endif /* __HAIR__ */ # ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_instancing) { + return bvh_intersect_shadow_all_instancing(kg, + ray, + isect, + skip_object, + max_hits, + num_hits); + } # endif /* __INSTANCING__ */ - return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_shadow_all(kg, + ray, + isect, + skip_object, + max_hits, + num_hits); } #endif /* __SHADOW_RECORD_ALL__ */ @@ -357,7 +391,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } -#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__) +#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__)) /* ToDo: Move to another file? */ ccl_device int intersections_compare(const void *a, const void *b) { @@ -373,5 +407,28 @@ ccl_device int intersections_compare(const void *a, const void *b) } #endif -CCL_NAMESPACE_END +#if defined(__SHADOW_RECORD_ALL__) +ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits) +{ +#ifdef __KERNEL_GPU__ + /* Use bubble sort which has more friendly memory pattern on GPU. 
*/ + bool swapped; + do { + swapped = false; + for(int j = 0; j < num_hits - 1; ++j) { + if(hits[j].t > hits[j + 1].t) { + struct Intersection tmp = hits[j]; + hits[j] = hits[j + 1]; + hits[j + 1] = tmp; + swapped = true; + } + } + --num_hits; + } while(swapped); +#else + qsort(hits, num_hits, sizeof(Intersection), intersections_compare); +#endif +} +#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */ +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 726bef1794c..74a9ebf14e4 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -17,8 +17,8 @@ // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and // 3-vector which might be faster. ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, - int node_addr, - int child) + int node_addr, + int child) { Transform space; const int child_addr = node_addr + child * 3; @@ -31,12 +31,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k #if !defined(__KERNEL_SSE2__) ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { /* fetch node data */ @@ -78,14 +78,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { /* fetch node data */ @@ -203,13 +203,13 @@ 
ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust( } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { int mask = 0; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); @@ -233,15 +233,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { int mask = 0; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); @@ -265,13 +265,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -296,15 +296,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - 
const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -442,19 +442,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust( } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef& isect_near, - const ssef& isect_far, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const int node_addr, + const uint visibility, + float dist[2]) { Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir);; + aligned_dir1 = transform_direction(&space1, dir); float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), @@ -503,20 +503,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef& isect_near, - const ssef& isect_far, - const float difl, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const float difl, + const int node_addr, + const uint visibility, + float dist[2]) { Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = 
transform_direction(&space1, dir);; + aligned_dir1 = transform_direction(&space1, dir); float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), @@ -574,17 +574,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -612,19 +612,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h 
b/intern/cycles/kernel/bvh/bvh_shadow_all.h index df33a86bb18..267e098f912 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_shadow_all.h" +# include "kernel/bvh/qbvh_shadow_all.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -45,6 +45,7 @@ ccl_device_inline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const int skip_object, const uint max_hits, uint *num_hits) { @@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -189,6 +187,16 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); +#ifdef __SHADOW_TRICKS__ + uint tri_object = (object == OBJECT_NONE) + ? 
kernel_tex_fetch(__prim_object, prim_addr) + : object; + if(tri_object == skip_object) { + ++prim_addr; + continue; + } +#endif + bool hit; /* todo: specialized intersect functions which don't fill in @@ -198,9 +206,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, + dir, PATH_RAY_SHADOW, object, prim_addr); @@ -309,12 +317,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -354,22 +361,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); - /* scale isect->t to adjust for instancing */ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; } } else { - float ignore_t = FLT_MAX; - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; @@ -400,6 +402,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const int skip_object, const uint 
max_hits, uint *num_hits) { @@ -408,6 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(QBVH)(kg, ray, isect_array, + skip_object, max_hits, num_hits); } @@ -418,6 +422,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, return BVH_FUNCTION_FULL_NAME(BVH)(kg, ray, isect_array, + skip_object, max_hits, num_hits); } diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h index 889bbca21e2..bda7e34907a 100644 --- a/intern/cycles/kernel/bvh/bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_subsurface.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_subsurface.h" +# include "kernel/bvh/qbvh_subsurface.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -75,16 +75,16 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - bvh_instance_motion_push(kg, - subsurface_object, - ray, - &P, - &dir, - &idir, - &isect_t, - &ob_itfm); + isect_t = bvh_instance_motion_push(kg, + subsurface_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); #else - bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t); #endif object = subsurface_object; } @@ -109,9 +109,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -197,9 +194,9 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, - &isect_precalc, ss_isect, P, + dir, object, prim_addr, isect_t, diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h 
b/intern/cycles/kernel/bvh/bvh_traversal.h index 80c8f31473a..c58d3b0316c 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_traversal.h" +# include "kernel/bvh/qbvh_traversal.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -238,9 +235,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr)) @@ -354,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); @@ -391,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - 
triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 57e5b8d736d..764aaee44a1 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_volume.h" +# include "kernel/bvh/qbvh_volume.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -194,9 +191,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr); @@ -238,13 +235,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); - # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); Psplat[1] = ssef(P.y); @@ -281,13 +276,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, 
isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); - # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); Psplat[1] = ssef(P.y); diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 5a1accebaa0..04ec334e54d 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_volume_all.h" +# include "kernel/bvh/qbvh_volume_all.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -101,9 +101,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -199,9 +196,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, + dir, visibility, object, prim_addr); @@ -288,14 +285,12 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -341,20 +336,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); /* Scale isect->t to adjust for instancing. 
*/ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index 607295f9ed5..ce474438f2c 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -33,6 +33,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const int skip_object, const uint max_hits, uint *num_hits) { @@ -96,15 +97,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; if(false #ifdef __VISIBILITY_FLAG__ @@ -270,6 +269,16 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); +#ifdef __SHADOW_TRICKS__ + uint tri_object = (object == OBJECT_NONE) + ? 
kernel_tex_fetch(__prim_object, prim_addr) + : object; + if(tri_object == skip_object) { + ++prim_addr; + continue; + } +#endif + bool hit; /* todo: specialized intersect functions which don't fill in @@ -279,9 +288,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, + dir, PATH_RAY_SHADOW, object, prim_addr); @@ -390,9 +399,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif num_hits_in_instance = 0; @@ -414,8 +423,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -445,11 +452,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif } @@ -472,8 +478,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff 
--git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h index 84dc4003133..be7658d11d7 100644 --- a/intern/cycles/kernel/bvh/qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h @@ -64,16 +64,16 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - bvh_instance_motion_push(kg, - subsurface_object, - ray, - &P, - &dir, - &idir, - &isect_t, - &ob_itfm); + isect_t = bvh_instance_motion_push(kg, + subsurface_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); #else - bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t); #endif object = subsurface_object; } @@ -105,9 +105,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -253,9 +250,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); triangle_intersect_subsurface(kg, - &isect_precalc, ss_isect, P, + dir, object, prim_addr, isect_t, diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index 10ae7bee852..fca75a1d416 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -106,15 +106,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { /* Traverse internal nodes. 
*/ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; if(UNLIKELY(node_dist > isect->t) #if BVH_FEATURE(BVH_MOTION) @@ -122,8 +120,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, || UNLIKELY(ray->time > inodes.z) #endif #ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0) + || (__float_as_uint(inodes.x) & visibility) == 0 #endif + ) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; @@ -333,9 +332,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr)) { @@ -447,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -468,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance pop. 
*/ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -489,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; node_dist = traversal_stack[stack_ptr].dist; diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index dc6627e2dbb..192ce009524 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -266,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. 
*/ - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr); + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); } break; } @@ -295,9 +292,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -316,8 +313,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -341,9 +336,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance pop. 
*/ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -362,8 +357,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index ff1fa92af6e..ac5f58a9a51 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -271,7 +268,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr); + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); if(hit) { /* Move on to next entry in intersections array. 
*/ isect_array++; @@ -346,9 +343,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif qbvh_near_far_idx_calc(idir, @@ -367,7 +364,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -406,11 +402,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif } @@ -433,8 +428,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index b7abc1ec507..e799855a65e 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty { kernel_assert(size <= sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = 
ccl_fetch(sd, num_closure_extra); + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra; if(num_closure + num_closure_extra >= MAX_CLOSURE) return NULL; - ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure]; + ShaderClosure *sc = &sd->closure[num_closure]; sc->type = type; sc->weight = weight; - ccl_fetch(sd, num_closure)++; + sd->num_closure++; return sc; } @@ -44,25 +44,25 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size) * This lets us keep the same fast array iteration over closures, as we * found linked list iteration and iteration with skipping to be slower. */ int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra; + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra + num_extra; if(num_closure + num_closure_extra > MAX_CLOSURE) { /* Remove previous closure. */ - ccl_fetch(sd, num_closure)--; - ccl_fetch(sd, num_closure_extra)++; + sd->num_closure--; + sd->num_closure_extra++; return NULL; } - ccl_fetch(sd, num_closure_extra) = num_closure_extra; - return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra); + sd->num_closure_extra = num_closure_extra; + return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra); } ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight) { ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight); - if(!sc) + if(sc == NULL) return NULL; float sample_weight = fabsf(average(weight)); diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 04f9e711c7e..0302fa9b43e 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -14,77 +14,77 @@ * limitations under the License. 
*/ -#include "../closure/bsdf_ashikhmin_velvet.h" -#include "../closure/bsdf_diffuse.h" -#include "../closure/bsdf_oren_nayar.h" -#include "../closure/bsdf_phong_ramp.h" -#include "../closure/bsdf_diffuse_ramp.h" -#include "../closure/bsdf_microfacet.h" -#include "../closure/bsdf_microfacet_multi.h" -#include "../closure/bsdf_reflection.h" -#include "../closure/bsdf_refraction.h" -#include "../closure/bsdf_transparent.h" -#include "../closure/bsdf_ashikhmin_shirley.h" -#include "../closure/bsdf_toon.h" -#include "../closure/bsdf_hair.h" -#include "../closure/bsdf_principled_diffuse.h" -#include "../closure/bsdf_principled_sheen.h" +#include "kernel/closure/bsdf_ashikhmin_velvet.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_oren_nayar.h" +#include "kernel/closure/bsdf_phong_ramp.h" +#include "kernel/closure/bsdf_diffuse_ramp.h" +#include "kernel/closure/bsdf_microfacet.h" +#include "kernel/closure/bsdf_microfacet_multi.h" +#include "kernel/closure/bsdf_reflection.h" +#include "kernel/closure/bsdf_refraction.h" +#include "kernel/closure/bsdf_transparent.h" +#include "kernel/closure/bsdf_ashikhmin_shirley.h" +#include "kernel/closure/bsdf_toon.h" +#include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #ifdef __SUBSURFACE__ -# include "../closure/bssrdf.h" +# include "kernel/closure/bssrdf.h" #endif #ifdef __VOLUME__ -# include "../closure/volume.h" +# include "kernel/closure/volume.h" #endif CCL_NAMESPACE_BEGIN ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - const ShaderClosure *sc, - float randu, - float randv, - float3 *eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) + ShaderData *sd, + const ShaderClosure *sc, + float randu, + float randv, + float3 *eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int label; switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case 
CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = bsdf_refraction_sample(sc, ccl_fetch(sd, 
Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: @@ -93,63 +93,63 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: - label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: - label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + label = bsdf_microfacet_multi_ggx_glass_sample(kg, 
sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = 
bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: - label = bsdf_principled_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: - label = bsdf_principled_sheen_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif default: @@ -173,35 +173,35 @@ float3 bsdf_eval(KernelGlobals *kg, { float3 eval; - if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { + if(dot(sd->Ng, omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = 
bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: @@ -209,51 +209,51 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: - eval = 
bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = 
bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: - eval = bsdf_principled_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: - eval = bsdf_principled_sheen_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -265,23 +265,23 @@ float3 bsdf_eval(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case 
CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: @@ -289,51 +289,51 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: - eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, 
sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: - eval = bsdf_principled_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: - eval = bsdf_principled_sheen_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index b74e8ab97cf..58f6140970d 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -327,7 +327,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->ior == bsdf_b->ior) && - ((!bsdf_a->extra && !bsdf_b->extra) || + ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) || ((bsdf_a->extra && bsdf_b->extra) && (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)))); } diff --git 
a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index a49b0717a3d..57f1e733ee7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -43,7 +43,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) { if(cosI > 0.9999f || cosI < 1e-6f) { - const float r = sqrtf(randU.x / (1.0f - randU.x)); + const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); const float phi = M_2PI_F * randU.y; return make_float2(r*cosf(phi), r*sinf(phi)); } @@ -83,7 +83,7 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); - const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y)); + const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y); @@ -313,18 +313,18 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons #define MF_PHASE_FUNCTION glass #define MF_MULTI_GLASS -#include "bsdf_microfacet_multi_impl.h" +#include "kernel/closure/bsdf_microfacet_multi_impl.h" /* The diffuse phase function is not implemented as a node yet. 
*/ #if 0 #define MF_PHASE_FUNCTION diffuse #define MF_MULTI_DIFFUSE -#include "bsdf_microfacet_multi_impl.h" +#include "kernel/closure/bsdf_microfacet_multi_impl.h" #endif #define MF_PHASE_FUNCTION glossy #define MF_MULTI_GLOSSY -#include "bsdf_microfacet_multi_impl.h" +#include "kernel/closure/bsdf_microfacet_multi_impl.h" ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness) { diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index 6838e26c242..c623e3490fd 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -14,19 +14,19 @@ * limitations under the License. */ -#include "geom_attribute.h" -#include "geom_object.h" +#include "kernel/geom/geom_attribute.h" +#include "kernel/geom/geom_object.h" #ifdef __PATCH_EVAL__ -# include "geom_patch.h" +# include "kernel/geom/geom_patch.h" #endif -#include "geom_triangle.h" -#include "geom_subd_triangle.h" -#include "geom_triangle_intersect.h" -#include "geom_motion_triangle.h" -#include "geom_motion_triangle_intersect.h" -#include "geom_motion_triangle_shader.h" -#include "geom_motion_curve.h" -#include "geom_curve.h" -#include "geom_volume.h" -#include "geom_primitive.h" +#include "kernel/geom/geom_triangle.h" +#include "kernel/geom/geom_subd_triangle.h" +#include "kernel/geom/geom_triangle_intersect.h" +#include "kernel/geom/geom_motion_triangle.h" +#include "kernel/geom/geom_motion_triangle_intersect.h" +#include "kernel/geom/geom_motion_triangle_shader.h" +#include "kernel/geom/geom_motion_curve.h" +#include "kernel/geom/geom_curve.h" +#include "kernel/geom/geom_volume.h" +#include "kernel/geom/geom_primitive.h" diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 08ccee56335..cc62192ef21 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const 
ShaderData * ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { return ATTR_PRIM_CURVE; } else @@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found() ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id) { - if(ccl_fetch(sd, object) == PRIM_NONE) { + if(sd->object == PRIM_NONE) { return attribute_not_found(); } /* for SVM, find attribute by unique id */ - uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; + uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; attr_offset += attribute_primitive_type(kg, sd); uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh AttributeDescriptor desc; desc.element = (AttributeElement)attr_map.y; - if(ccl_fetch(sd, prim) == PRIM_NONE && + if(sd->prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH && desc.element != ATTR_ELEMENT_VOXEL && desc.element != ATTR_ELEMENT_OBJECT) diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 9de335403ce..bb33b91847e 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -22,6 +22,12 @@ CCL_NAMESPACE_BEGIN #ifdef __HAIR__ +#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) +# define ccl_device_curveintersect ccl_device +#else +# define ccl_device_curveintersect ccl_device_forceinline +#endif + /* Reading attributes on various curve elements */ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) @@ -32,22 +38,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, if(dy) *dy = 0.0f; #endif - return 
kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = 0.0f; #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -71,22 +77,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 
- f0); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -104,22 +110,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) { float r = 0.0f; - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + if(sd->type & PRIMITIVE_ALL_CURVE) { + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } - r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w; + r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; } return r*2.0f; @@ -130,8 +136,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; @@ -139,7 +145,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + 
float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u)); + return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u); } /* Curve tangent normal */ @@ -148,14 +154,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) { float3 tgN = make_float3(0.0f,0.0f,0.0f); - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { - tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu)))); + tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); tgN = normalize(tgN); /* need to find suitable scaled gd for corrected normal */ #if 0 - tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu)); + tgN = normalize(tgN - gd * sd->dPdu); #endif } @@ -222,13 +228,22 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) #ifdef __KERNEL_SSE2__ /* Pass P and dir by reference to aligned vector */ -ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, +ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) #else -ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, +ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) #endif { + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + int segment = PRIMITIVE_UNPACK_SEGMENT(type); float epsilon = 
0.0f; float r_st, r_en; @@ -255,9 +270,9 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte int ka = max(k0 - 1, v00.x); int kb = min(k1 + 1, v00.x + v00.y - 1); -#ifdef __KERNEL_AVX2__ +#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800) avxf P_curve_0_1, P_curve_2_3; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); } @@ -268,7 +283,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte #else /* __KERNEL_AVX2__ */ ssef P_curve[4]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); @@ -290,7 +305,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); -#ifdef __KERNEL_AVX2__ +#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800) const avxf vPP = _mm256_broadcast_ps(&P.m128); const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); @@ -363,7 +378,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte float4 P_curve[4]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = kernel_tex_fetch(__curve_keys, ka); P_curve[1] = kernel_tex_fetch(__curve_keys, k0); P_curve[2] = kernel_tex_fetch(__curve_keys, k1); @@ -679,7 +694,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte return hit; } -ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, +ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, 
Intersection *isect, float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) { /* define few macros to minimize code duplication for SSE */ @@ -689,6 +704,15 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection # define dot3(x, y) dot(x, y) #endif + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + int segment = PRIMITIVE_UNPACK_SEGMENT(type); /* curve Intersection check */ int flags = kernel_data.curve.curveflags; @@ -703,7 +727,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection #ifndef __KERNEL_SSE2__ float4 P_curve[2]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = kernel_tex_fetch(__curve_keys, k0); P_curve[1] = kernel_tex_fetch(__curve_keys, k1); } @@ -738,7 +762,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection #else ssef P_curve[2]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); } @@ -948,7 +972,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ -961,7 +985,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con int prim = kernel_tex_fetch(__prim_index, isect->prim); float4 v00 = kernel_tex_fetch(__curves, prim); - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + int k0 = __float_as_int(v00.x) + 
PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 tg; @@ -972,14 +996,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float4 P_curve[4]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0] = kernel_tex_fetch(__curve_keys, ka); P_curve[1] = kernel_tex_fetch(__curve_keys, k0); P_curve[2] = kernel_tex_fetch(__curve_keys, k1); P_curve[3] = kernel_tex_fetch(__curve_keys, kb); } else { - motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve); + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); } float3 p[4]; @@ -991,43 +1015,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con P = P + D*t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = 0.0f; + sd->u = isect->u; + sd->v = 0.0f; #endif tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D)))); + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); } else { /* direction from inside to surface of curve */ float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - ccl_fetch(sd, Ng) = normalize(P - p_curr); + sd->Ng = normalize(P - p_curr); /* adjustment for changing radius */ float gd = isect->v; if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); } } /* todo: sometimes the normal is still so that this is detected as * backfacing even if cull backfaces is enabled */ - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); + sd->N = sd->Ng; } else { float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - 
motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } float l = 1.0f; @@ -1038,39 +1062,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float3 dif = P - float4_to_float3(P_curve[0]); #ifdef __UV__ - ccl_fetch(sd, u) = dot(dif,tg)/l; - ccl_fetch(sd, v) = 0.0f; + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; #endif if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D)); - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); } else { float gd = isect->v; /* direction from inside to surface of curve */ - ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd); + sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); /* adjustment for changing radius */ if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); } } - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); + sd->N = sd->Ng; } #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = tg; - ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng)); + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); #endif if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index d57d74ea882..f74995becf5 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, return P; } # 
ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, # ifdef __INTERSECTION_REFINE__ if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -166,14 +166,15 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, * time and do a ray intersection with the resulting triangle. */ -ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, - Intersection *isect, - float3 P, - float3 dir, - float time, - uint visibility, - int object, - int prim_addr) +ccl_device_inline bool motion_triangle_intersect( + KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 dir, + float time, + uint visibility, + int object, + int prim_addr) { /* Primitive index for vertex location lookup. */ int prim = kernel_tex_fetch(__prim_index, prim_addr); @@ -185,11 +186,15 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, motion_triangle_vertices(kg, fobject, prim, time, verts); /* Ray-triangle intersection, unoptimized. 
*/ float t, u, v; - if(ray_triangle_intersect_uv(P, - dir, - isect->t, - verts[2], verts[0], verts[1], - &u, &v, &t)) + if(ray_triangle_intersect(P, + dir, + isect->t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption @@ -237,11 +242,15 @@ ccl_device_inline void motion_triangle_intersect_subsurface( motion_triangle_vertices(kg, fobject, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if(ray_triangle_intersect_uv(P, - dir, - tmax, - verts[2], verts[0], verts[1], - &u, &v, &t)) + if(ray_triangle_intersect(P, + dir, + tmax, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) { for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { if(ss_isect->hits[i].t == t) { diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h index 0e024a05db6..cb456056e20 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -39,26 +39,26 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, bool subsurface) { /* Get shader. */ - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* Get motion info. */ /* TODO(sergey): This logic is really similar to motion_triangle_vertices(), * can we de-duplicate something here? */ int numsteps, numverts; - object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); + object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); /* Figure out which steps we need to fetch and their interpolation factor. 
*/ int maxstep = numsteps*2; - int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); - float t = ccl_fetch(sd, time)*maxstep - step; + int step = min((int)(sd->time*maxstep), maxstep-1); + float t = sd->time*maxstep - step; /* Find attribute. */ AttributeElement elem; - int offset = find_attribute_motion(kg, ccl_fetch(sd, object), + int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* Fetch vertex coordinates. */ float3 verts[3], next_verts[3]; - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); /* Interpolate between steps. */ @@ -68,7 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, /* Compute refined position. */ #ifdef __SUBSURFACE__ if(subsurface) { - ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, + sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, @@ -77,29 +77,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, else #endif /* __SUBSURFACE__*/ { - ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); } /* Compute face normal. */ float3 Ng; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); } else { Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); } - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->Ng = Ng; + sd->N = Ng; /* Compute derivatives of P w.r.t. uv. 
*/ #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); - ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); + sd->dPdu = (verts[0] - verts[2]); + sd->dPdv = (verts[1] - verts[2]); #endif /* Compute smooth normal. */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { /* Find attribute. */ AttributeElement elem; int offset = find_attribute_motion(kg, - ccl_fetch(sd, object), + sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); @@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; /* Interpolate between vertices. */ - float u = ccl_fetch(sd, u); - float v = ccl_fetch(sd, v); + float u = sd->u; + float v = sd->v; float w = 1.0f - u - v; - ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); + sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index f51b2d18657..6ecdfe0173a 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -137,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P); + *P = transform_point_auto(&sd->ob_tfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -149,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { 
#ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P); + *P = transform_point_auto(&sd->ob_itfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -161,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N)); + if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); } #else - if(ccl_fetch(sd, object) != OBJECT_NONE) { - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + if(sd->object != OBJECT_NONE) { + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); } #endif @@ -177,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N)); + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -189,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void 
object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D); + *D = transform_direction_auto(&sd->ob_tfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -201,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D); + *D = transform_direction_auto(&sd->ob_itfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -212,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, object) == OBJECT_NONE) + if(sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); + return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -326,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1); + return 
kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1); } /* Particle data from which object was instanced */ @@ -425,7 +425,13 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir) /* Transform ray into object space to enter static object in BVH */ -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) +ccl_device_inline float bvh_instance_push(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -435,8 +441,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if(*t != FLT_MAX) - *t *= len; + if(t != FLT_MAX) { + t *= len; + } + + return t; } #ifdef __QBVH__ @@ -473,16 +482,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg, /* Transorm ray to exit static object in BVH */ -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) +ccl_device_inline float bvh_instance_pop(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { - if(*t != FLT_MAX) { + if(t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - *t /= len(transform_direction(&tfm, ray->D)); + t /= len(transform_direction(&tfm, ray->D)); } *P = ray->P; *dir = bvh_clamp_direction(ray->D); *idir = bvh_inverse_direction(*dir); + + return t; } /* Same as above, but returns scale factor to apply to multiple intersection distances */ @@ -501,13 +518,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter 
motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, +ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, - ccl_addr_space float *t, + float t, Transform *itfm) { object_fetch_transform_motion_test(kg, object, ray->time, itfm); @@ -518,8 +535,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if(*t != FLT_MAX) - *t *= len; + if(t != FLT_MAX) { + t *= len; + } + + return t; } #ifdef __QBVH__ @@ -557,22 +577,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, /* Transorm ray to exit motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, - int object, - const Ray *ray, - float3 *P, - float3 *dir, - float3 *idir, - ccl_addr_space float *t, - Transform *itfm) -{ - if(*t != FLT_MAX) { - *t /= len(transform_direction(itfm, ray->D)); +ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t, + Transform *itfm) +{ + if(t != FLT_MAX) { + t /= len(transform_direction(itfm, ray->D)); } *P = ray->P; *dir = bvh_clamp_direction(ray->D); *idir = bvh_inverse_direction(*dir); + + return t; } /* Same as above, but returns scale factor to apply to multiple intersection distances */ diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 6a0ff5a4a04..5663b598508 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), 
patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float val = 0.0f; @@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); @@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 8a73bb2f78b..90a9c2147cc 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, const AttributeDescriptor desc, float *dx, float *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else 
if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float(kg, sd, desc, dx, dy); } #endif @@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float3(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float3(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float3(kg, sd, desc, dx, dy); } #endif @@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) + if(sd->type & PRIMITIVE_ALL_CURVE) # ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); # else return make_float3(0.0f, 0.0f, 0.0f); # endif @@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N)))); + return cross(sd->N, normalize(cross(data, sd->N))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -153,17 +153,17 @@ ccl_device_inline 
float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * float3 center; #ifdef __HAIR__ - bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; + bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &center); } } else #endif - center = ccl_fetch(sd, P); + center = sd->P; float3 motion_pre = center, motion_post = center; @@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * if(desc.offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); + object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); /* lookup attributes */ motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL); - desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; + desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? 
numverts: numkeys; motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 motion_center; diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 647840dc696..044e82f03d4 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) { - return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0; + return (sd->prim != PRIM_NONE) ? 
kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x); uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y); @@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float a, dads, dadt; a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt); @@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = 
sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER) { float2 uv[3]; @@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = 0.0f; @@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float3 a, dads, dadt; @@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ 
-299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { float2 uv[3]; @@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 3229091bbb0..47778553b94 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - 
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* return normal */ - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { return normalize(cross(v2 - v0, v1 - v0)); } else { @@ -110,34 +110,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y); float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER) { - int 
tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = 0.0f; @@ -153,24 +153,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y)); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + 
ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float3 f0, f1, f2; if(desc.element == ATTR_ELEMENT_CORNER) { @@ -185,11 +185,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index 4db121d94f4..804e74d7e37 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -22,232 +22,50 @@ CCL_NAMESPACE_BEGIN -/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed - * component of float3 value. - */ -#ifndef __KERNEL_CPU__ -# define IDX(vec, idx) \ - ((idx == 0) ? ((vec).x) : ( (idx == 1) ? 
((vec).y) : ((vec).z) )) -#else -# define IDX(vec, idx) ((vec)[idx]) -#endif - -/* Ray-Triangle intersection for BVH traversal - * - * Sven Woop - * Watertight Ray/Triangle Intersection - * - * http://jcgt.org/published/0002/01/05/paper.pdf - */ - -/* Precalculated data for the ray->tri intersection. */ -typedef struct IsectPrecalc { - /* Maximal dimension kz, and orthogonal dimensions. */ - int kx, ky, kz; - - /* Shear constants. */ - float Sx, Sy, Sz; -} IsectPrecalc; - -#if (defined(__KERNEL_OPENCL_APPLE__)) || \ - (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))) -ccl_device_noinline -#else -ccl_device_inline -#endif -void triangle_intersect_precalc(float3 dir, - IsectPrecalc *isect_precalc) -{ - /* Calculate dimension where the ray direction is maximal. */ -#ifndef __KERNEL_SSE__ - int kz = util_max_axis(make_float3(fabsf(dir.x), - fabsf(dir.y), - fabsf(dir.z))); - int kx = kz + 1; if(kx == 3) kx = 0; - int ky = kx + 1; if(ky == 3) ky = 0; -#else - int kx, ky, kz; - /* Avoiding mispredicted branch on direction. */ - kz = util_max_axis(fabs(dir)); - static const char inc_xaxis[] = {1, 2, 0, 55}; - static const char inc_yaxis[] = {2, 0, 1, 55}; - kx = inc_xaxis[kz]; - ky = inc_yaxis[kz]; -#endif - - float dir_kz = IDX(dir, kz); - - /* Swap kx and ky dimensions to preserve winding direction of triangles. */ - if(dir_kz < 0.0f) { - int tmp = kx; - kx = ky; - ky = tmp; - } - - /* Calculate the shear constants. */ - float inv_dir_z = 1.0f / dir_kz; - isect_precalc->Sx = IDX(dir, kx) * inv_dir_z; - isect_precalc->Sy = IDX(dir, ky) * inv_dir_z; - isect_precalc->Sz = inv_dir_z; - - /* Store the dimensions. */ - isect_precalc->kx = kx; - isect_precalc->ky = ky; - isect_precalc->kz = kz; -} - -/* TODO(sergey): Make it general utility function. 
*/ -ccl_device_inline float xor_signmask(float x, int y) -{ - return __int_as_float(__float_as_int(x) ^ y); -} - ccl_device_inline bool triangle_intersect(KernelGlobals *kg, - const IsectPrecalc *isect_precalc, Intersection *isect, float3 P, + float3 dir, uint visibility, int object, int prim_addr) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permute_mask); - const avxf BC_k = shuffle(BC, permute_mask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy 
* BCBA_yx, neg_mask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return false; - } +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. */ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return false; - } #endif - - /* Calculate determinant. 
*/ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return false; - } - - /* Calculate scaled z-coordinates of vertices and use them to calculate - * the hit distance. - */ - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const int sign_det = (__float_as_int(det) & 0x80000000); - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > isect->t * xor_signmask(det, sign_det))) + float t, u, v; + if(ray_triangle_intersect(P, + dir, + isect->t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + float4_to_float3(tri_a), + float4_to_float3(tri_b), + float4_to_float3(tri_c), +#endif + &u, &v, &t)) { - return false; - } - #ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) + /* Visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags. + */ + if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) #endif - { -#ifdef __KERNEL_CUDA__ - if(A == B && B == C) { - return false; + { + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_TRIANGLE; + isect->u = u; + isect->v = v; + isect->t = t; + return true; } -#endif - /* Normalize U, V, W, and T. 
*/ - const float inv_det = 1.0f / det; - isect->prim = prim_addr; - isect->object = object; - isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; - isect->t = T * inv_det; - return true; } return false; } @@ -260,138 +78,37 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, #ifdef __SUBSURFACE__ ccl_device_inline void triangle_intersect_subsurface( KernelGlobals *kg, - const IsectPrecalc *isect_precalc, SubsurfaceIntersection *ss_isect, float3 P, + float3 dir, int object, int prim_addr, float tmax, uint *lcg_state, int max_hits) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); - const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), - tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), - tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permuteMask); - const avxf BC_k = shuffle(BC, permuteMask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - 
_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return; - } +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. 
*/ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return; - } + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return; - } - - /* Calculate scaled z−coordinates of vertices and use them to calculate - * the hit distance. - */ - const int sign_det = (__float_as_int(det) & 0x80000000); - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > tmax * xor_signmask(det, sign_det))) + float t, u, v; + if(!ray_triangle_intersect(P, + dir, + tmax, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + tri_a, tri_b, tri_c, +#endif + &u, &v, &t)) { return; } - /* Normalize U, V, W, and T. */ - const float inv_det = 1.0f / det; - - const float t = T * inv_det; for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { if(ss_isect->hits[i].t == t) { return; @@ -418,18 +135,19 @@ ccl_device_inline void triangle_intersect_subsurface( isect->prim = prim_addr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; + isect->u = u; + isect->v = v; isect->t = t; /* Record geometric normal. */ - /* TODO(sergey): Use float4_to_float3() on just an edges. 
*/ - const float3 v0 = float4_to_float3(tri_a); - const float3 v1 = float4_to_float3(tri_b); - const float3 v2 = float4_to_float3(tri_c); - ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0)); -} +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif + ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); +} +#endif /* __SUBSURFACE__ */ /* Refine triangle intersection to more precise hit point. For rays that travel * far the precision is often not so good, this reintersects the primitive from @@ -457,7 +175,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); # endif @@ -491,7 +209,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); # endif @@ -519,7 +237,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -557,7 +275,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -570,6 +288,4 @@ ccl_device_inline float3 
triangle_refine_subsurface(KernelGlobals *kg, return P; } -#undef IDX - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 03724c955be..1e0ef5201c9 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -64,7 +64,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); + float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_CUDA__ # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); @@ -91,7 +91,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); + float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_CUDA__ # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 9279a94c13a..06c0fb2fbca 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -19,7 +19,8 @@ /* CPU Kernel Interface */ -#include "util_types.h" +#include "util/util_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN @@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct KernelGlobals; +struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); @@ -46,31 +48,31 @@ void kernel_tex_copy(KernelGlobals *kg, ExtensionType extension = EXTENSION_REPEAT); #define KERNEL_ARCH cpu -#include 
"kernels/cpu/kernel_cpu.h" +#include "kernel/kernels/cpu/kernel_cpu.h" #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # define KERNEL_ARCH cpu_sse2 -# include "kernels/cpu/kernel_cpu.h" +# include "kernel/kernels/cpu/kernel_cpu.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 # define KERNEL_ARCH cpu_sse3 -# include "kernels/cpu/kernel_cpu.h" +# include "kernel/kernels/cpu/kernel_cpu.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 # define KERNEL_ARCH cpu_sse41 -# include "kernels/cpu/kernel_cpu.h" +# include "kernel/kernels/cpu/kernel_cpu.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX # define KERNEL_ARCH cpu_avx -# include "kernels/cpu/kernel_cpu.h" +# include "kernel/kernels/cpu/kernel_cpu.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 # define KERNEL_ARCH cpu_avx2 -# include "kernels/cpu/kernel_cpu.h" +# include "kernel/kernels/cpu/kernel_cpu.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 6c3ee6b8098..823d30dde78 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -52,10 +52,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v { eval->diffuse = value; } +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f); +#endif } -ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) +ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight) { +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis += value; +#endif + value *= mis_weight; #ifdef __PASSES__ if(eval->use_light_pass) { if(CLOSURE_IS_BSDF_DIFFUSE(type)) @@ -96,7 +103,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) } } -ccl_device_inline 
void bsdf_eval_mul(BsdfEval *eval, float value) +ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value) { #ifdef __PASSES__ if(eval->use_light_pass) { @@ -115,8 +122,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) } } +ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) +{ +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis *= value; +#endif + bsdf_eval_mis(eval, value); +} + ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) { +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis *= value; +#endif #ifdef __PASSES__ if(eval->use_light_pass) { eval->diffuse *= value; @@ -134,7 +152,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) #endif } -ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval) +ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval) { #ifdef __PASSES__ if(eval->use_light_pass) { @@ -198,6 +216,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) { L->emission = make_float3(0.0f, 0.0f, 0.0f); } + +#ifdef __SHADOW_TRICKS__ + L->path_total = make_float3(0.0f, 0.0f, 0.0f); + L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); +#endif } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -252,7 +276,12 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } } -ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce) +ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + float3 throughput, + float3 alpha, + float3 bsdf, + float3 ao, + int bounce) { #ifdef __PASSES__ if(L->use_light_pass) { @@ -271,6 +300,26 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput { L->emission += throughput*bsdf*ao; } + +#ifdef __SHADOW_TRICKS__ + float3 light = throughput * bsdf; + L->path_total += light; + 
L->path_total_shaded += ao * light; +#endif +} + +ccl_device_inline void path_radiance_accum_total_ao( + PathRadiance *L, + float3 throughput, + float3 bsdf) +{ +#ifdef __SHADOW_TRICKS__ + L->path_total += throughput * bsdf; +#else + (void) L; + (void) throughput; + (void) bsdf; +#endif } ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) @@ -301,15 +350,38 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through { L->emission += throughput*bsdf_eval->diffuse*shadow; } + +#ifdef __SHADOW_TRICKS__ + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; +#endif +} + +ccl_device_inline void path_radiance_accum_total_light( + PathRadiance *L, + float3 throughput, + const BsdfEval *bsdf_eval) +{ +#ifdef __SHADOW_TRICKS__ + L->path_total += throughput * bsdf_eval->sum_no_mis; +#else + (void) L; + (void) throughput; + (void) bsdf_eval; +#endif } -ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce) +ccl_device_inline void path_radiance_accum_background(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) + if(state->bounce == 0) L->background += throughput*value; - else if(bounce == 1) + else if(state->bounce == 1) L->direct_emission += throughput*value; else L->indirect += throughput*value; @@ -319,6 +391,13 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th { L->emission += throughput*value; } + +#ifdef __SHADOW_TRICKS__ + L->path_total += throughput * value; + if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { + L->path_total_shaded += throughput * value; + } +#endif } ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) @@ -399,7 +478,7 @@ ccl_device_inline float3 
path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); /* Reject invalid value */ - if(!isfinite(sum)) { + if(!isfinite_safe(sum)) { kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!"); L_sum = make_float3(0.0f, 0.0f, 0.0f); @@ -468,7 +547,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi /* Reject invalid value */ float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if(!isfinite(sum)) { + if(!isfinite_safe(sum)) { kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); L_sum = make_float3(0.0f, 0.0f, 0.0f); } @@ -501,5 +580,34 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance L->emission += L_sample->emission * fac; } -CCL_NAMESPACE_END +#ifdef __SHADOW_TRICKS__ +/* Calculate current shadow of the path. */ +ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) +{ + float path_total = average(L->path_total); + float path_total_shaded = average(L->path_total_shaded); + if(path_total != 0.0f) { + return path_total_shaded / path_total; + } + return 1.0f; +} +/* Calculate final light sum and transparency for shadow catcher object. 
*/ +ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, + const PathRadiance *L, + ccl_addr_space float* L_transparent) +{ + const float shadow = path_radiance_sum_shadow(L); + float3 L_sum; + if(kernel_data.background.transparent) { + *L_transparent = shadow; + L_sum = make_float3(0.0f, 0.0f, 0.0f); + } + else { + L_sum = L->shadow_color * shadow; + } + return L_sum; +} +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 5bcc57cdcdf..f18d145f7cf 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -54,7 +54,8 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - /* TODO, disable the closures we won't need */ + /* TODO, disable more closures we don't need besides transparent */ + shader_bsdf_disable_transparency(kg, sd); #ifdef __BRANCHED_PATH__ if(!kernel_data.integrator.branched) { diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index dedac6b1465..0df5217d97a 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); Transform tfm = kernel_data.cam.worldtondc; @@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = 
normalize(transform_direction(&tfm, P)); diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 9d1f3bdc918..ae7c9b836c4 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -35,15 +35,24 @@ # define __NODES_FEATURES__ NODE_FEATURE_ALL #endif -#include "util_debug.h" -#include "util_math.h" -#include "util_simd.h" -#include "util_half.h" -#include "util_types.h" -#include "util_texture.h" +#include "util/util_debug.h" +#include "util/util_math.h" +#include "util/util_simd.h" +#include "util/util_half.h" +#include "util/util_types.h" +#include "util/util_texture.h" #define ccl_addr_space +#define ccl_local_id(d) 0 +#define ccl_global_id(d) (kg->global_id[d]) + +#define ccl_local_size(d) 1 +#define ccl_global_size(d) (kg->global_size[d]) + +#define ccl_group_id(d) ccl_global_id(d) +#define ccl_num_groups(d) ccl_global_size(d) + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. 
*/ @@ -78,9 +87,9 @@ template<typename T> struct texture { ccl_always_inline avxf fetch_avxf(const int index) { kernel_assert(index >= 0 && (index+1) < width); - ssef *ssefData = (ssef*)data; - ssef *ssefNodeData = &ssefData[index]; - return _mm256_loadu_ps((float *)ssefNodeData); + ssef *ssef_data = (ssef*)data; + ssef *ssef_node_data = &ssef_data[index]; + return _mm256_loadu_ps((float *)ssef_node_data); } #endif diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..39e98c7dda6 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -38,7 +38,7 @@ #define ccl_device __device__ __inline__ # define ccl_device_forceinline __device__ __forceinline__ -#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500) +#if __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ @@ -46,6 +46,9 @@ #define ccl_device_noinline __device__ __noinline__ #define ccl_global #define ccl_constant +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ @@ -57,8 +60,54 @@ /* Types */ -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" + +/* Work item functions */ + +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return 
blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} /* Textures */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index f076e3a7d37..c2263ac0d49 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -39,6 +39,7 @@ #define ccl_constant __constant #define ccl_global __global #define ccl_local __local +#define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict #define ccl_align(n) __attribute__((aligned(n))) @@ -49,6 +50,15 @@ # define ccl_addr_space #endif +#define ccl_local_id(d) get_local_id(d) +#define ccl_global_id(d) get_global_id(d) + +#define ccl_local_size(d) get_local_size(d) +#define ccl_global_size(d) get_global_size(d) + +#define ccl_group_id(d) get_group_id(d) +#define ccl_num_groups(d) get_num_groups(d) + /* Selective nodes compilation. */ #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX @@ -133,8 +143,8 @@ /* define NULL */ #define NULL 0 -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" #endif /* __KERNEL_COMPAT_OPENCL_H__ */ diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 8c7c651a053..9e7d51f23f5 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -67,7 +67,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, false, ls->lamp); - ls->Ng = ccl_fetch(emission_sd, Ng); + ls->Ng = emission_sd->Ng; /* no path flag, we're evaluating this for all closures. 
that's weak but * we'd have to do multiple evaluations otherwise */ @@ -76,7 +76,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, path_state_modify_bounce(state, false); /* evaluate emissive closure */ - if(ccl_fetch(emission_sd, flag) & SD_EMISSION) + if(emission_sd->flag & SD_EMISSION) eval = shader_emissive_eval(kg, emission_sd); else eval = make_float3(0.0f, 0.0f, 0.0f); @@ -112,7 +112,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, -ls->D, dD, ls->t, - ccl_fetch(sd, time)); + sd->time); if(is_zero(light_eval)) return false; @@ -120,7 +120,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, /* evaluate BSDF at shading point */ #ifdef __VOLUME__ - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); else { float bsdf_pdf; @@ -156,8 +156,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(bsdf_eval_is_zero(eval)) return false; - if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold; + if(kernel_data.integrator.light_inv_rr_threshold > 0.0f +#ifdef __SHADOW_TRICKS__ + && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0 +#endif + ) + { + float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold; if(probability < 1.0f) { if(rand_terminate >= probability) { return false; @@ -168,8 +173,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); - ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); + ray->P = ray_offset(sd->P, (transmit)? 
-sd->Ng: sd->Ng); if(ls->t == FLT_MAX) { /* distant light */ @@ -182,7 +187,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = differential3_zero(); } else { @@ -204,14 +209,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); + float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 8e66a3a0340..c9c97ea977e 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -16,6 +16,9 @@ /* Constant Globals */ +#ifndef __KERNEL_GLOBALS_H__ +#define __KERNEL_GLOBALS_H__ + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -44,7 +47,7 @@ typedef struct KernelGlobals { # define KERNEL_TEX(type, ttype, name) ttype name; # define KERNEL_IMAGE_TEX(type, ttype, name) -# include "kernel_textures.h" +# include "kernel/kernel_textures.h" KernelData __data; @@ -64,6 +67,13 @@ typedef struct KernelGlobals { /* Storage for decoupled volume steps. 
*/ VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + + /* split kernel */ + SplitData split_data; + SplitParams split_param_data; + + int2 global_size; + int2 global_id; } KernelGlobals; #endif /* __KERNEL_CPU__ */ @@ -76,7 +86,10 @@ typedef struct KernelGlobals { #ifdef __KERNEL_CUDA__ __constant__ KernelData __data; -typedef struct KernelGlobals {} KernelGlobals; +typedef struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ + Intersection hits_stack[64]; +} KernelGlobals; # ifdef __KERNEL_CUDA_TEX_STORAGE__ # define KERNEL_TEX(type, ttype, name) ttype name; @@ -84,7 +97,7 @@ typedef struct KernelGlobals {} KernelGlobals; # define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name; # endif # define KERNEL_IMAGE_TEX(type, ttype, name) ttype name; -# include "kernel_textures.h" +# include "kernel/kernel_textures.h" #endif /* __KERNEL_CUDA__ */ @@ -97,11 +110,11 @@ typedef ccl_addr_space struct KernelGlobals { # define KERNEL_TEX(type, ttype, name) \ ccl_global type *name; -# include "kernel_textures.h" +# include "kernel/kernel_textures.h" # ifdef __SPLIT_KERNEL__ - ShaderData *sd_input; - Intersection *isect_shadow; + SplitData split_data; + SplitParams split_param_data; # endif } KernelGlobals; @@ -143,3 +156,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o CCL_NAMESPACE_END +#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 9bee5603474..bd0e23b7705 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -17,11 +17,11 @@ #ifndef __KERNEL_MATH_H__ #define __KERNEL_MATH_H__ -#include "util_color.h" -#include "util_math.h" -#include "util_math_fast.h" -#include "util_texture.h" -#include "util_transform.h" +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_math_intersect.h" 
+#include "util/util_texture.h" +#include "util/util_transform.h" #endif /* __KERNEL_MATH_H__ */ - diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 7aec47e4957..ed523696571 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) { ccl_global float *buf = buffer; -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) atomic_add_and_fetch_float(buf, value); #else *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa #else ccl_global float3 *buf = (ccl_global float3*)buffer; *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #else ccl_global float4 *buf = (ccl_global float4*)buffer; *buf = (sample == 0)? 
value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, @@ -75,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || + if(!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { if(sample == 0) { if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); } if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, ccl_fetch(sd, object)); + float id = object_pass_id(kg, sd->object); kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); } if(flag & PASS_MATERIAL_ID) { @@ -96,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = ccl_fetch(sd, N); + float3 normal = sd->N; kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -127,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index f90701a8260..e7957042182 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -15,40 +15,41 @@ */ #ifdef __OSL__ -# include "osl_shader.h" +# include 
"kernel/osl/osl_shader.h" #endif -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_montecarlo.h" -#include "kernel_differential.h" -#include "kernel_camera.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_camera.h" -#include "geom/geom.h" -#include "bvh/bvh.h" +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" -#include "kernel_light.h" -#include "kernel_passes.h" +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" #ifdef __SUBSURFACE__ -# include "kernel_subsurface.h" +# include "kernel/kernel_subsurface.h" #endif #ifdef __VOLUME__ -# include "kernel_volume.h" +# include "kernel/kernel_volume.h" #endif -#include "kernel_path_state.h" -#include "kernel_shadow.h" -#include "kernel_emission.h" -#include "kernel_path_common.h" -#include "kernel_path_surface.h" -#include "kernel_path_volume.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shadow.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_path_common.h" +#include "kernel/kernel_path_surface.h" +#include "kernel/kernel_path_volume.h" +#include "kernel/kernel_path_subsurface.h" #ifdef __KERNEL_DEBUG__ -# include "kernel_debug.h" +# include "kernel/kernel_debug.h" #endif CCL_NAMESPACE_BEGIN @@ -75,22 +76,25 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef 
__OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); } + else { + path_radiance_accum_total_ao(L, throughput, ao_bsdf); + } } } @@ -289,9 +293,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* sample background shader */ float3 L_background = indirect_background(kg, emission_sd, state, ray); path_radiance_accum_background(L, + state, throughput, - L_background, - state->bounce); + L_background); #endif /* __BACKGROUND__ */ break; @@ -311,6 +315,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, shader_merge_closures(sd); #endif /* __BRANCHED_PATH__ */ +#ifdef __SHADOW_TRICKS__ + if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + } +#endif /* __SHADOW_TRICKS__ */ + /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ if(kernel_data.integrator.filter_glossy != FLT_MAX) { @@ -373,7 +383,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); + uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); float bssrdf_u, bssrdf_v; path_state_rng_2D(kg, @@ -395,7 +405,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #if defined(__EMISSION__) && defined(__BRANCHED_PATH__) if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_indirect; + int all = (kernel_data.integrator.sample_all_lights_indirect) || + (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, sd, 
@@ -413,172 +424,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } } -#ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif -bool kernel_path_subsurface_scatter( - KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - PathState *state, - RNG *rng, - Ray *ray, - float3 *throughput, - SubsurfaceIndirectRays *ss_indirect) -{ - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); - - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; - - /* do bssrdf scatter step if we picked a bssrdf closure */ - if(sc) { - /* We should never have two consecutive BSSRDF bounces, - * the second one should be converted to a diffuse BSDF to - * avoid this. - */ - kernel_assert(!ss_indirect->tracing); - - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - - SubsurfaceIntersection ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_intersect(kg, - &ss_isect, - sd, - sc, - &lcg_state, - bssrdf_u, bssrdf_v, - false); -# ifdef __VOLUME__ - ss_indirect->need_update_volume_stack = - kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* compute lighting with the BSDF closure */ - for(int hit = 0; hit < num_hits; hit++) { - /* NOTE: We reuse the existing ShaderData, we assume the path - * integration loop stops when this function returns true. 
- */ - subsurface_scatter_multi_setup(kg, - &ss_isect, - hit, - sd, - state, - state->flag, - sc, - false); - - PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; - Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; - float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; - PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays]; - - *hit_state = *state; - *hit_ray = *ray; - *hit_tp = *throughput; - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - - path_radiance_init(hit_L, kernel_data.film.use_light_pass); - hit_L->direct_throughput = L->direct_throughput; - path_radiance_copy_indirect(hit_L, L); - - kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); - - if(kernel_path_surface_bounce(kg, - rng, - sd, - hit_tp, - hit_state, - hit_L, - hit_ray)) - { -# ifdef __LAMP_MIS__ - hit_state->ray_t = 0.0f; -# endif /* __LAMP_MIS__ */ - -# ifdef __VOLUME__ - if(ss_indirect->need_update_volume_stack) { - Ray volume_ray = *ray; - /* Setup ray from previous surface point to the new one. 
*/ - volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, - &volume_ray.t); - - kernel_volume_stack_update_for_subsurface( - kg, - emission_sd, - &volume_ray, - hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - path_radiance_reset_indirect(L); - ss_indirect->num_rays++; - } - else { - path_radiance_accum_sample(L, hit_L, 1); - } - } - return true; - } - return false; -} - -ccl_device_inline void kernel_path_subsurface_init_indirect( - SubsurfaceIndirectRays *ss_indirect) -{ - ss_indirect->tracing = false; - ss_indirect->num_rays = 0; -} - -ccl_device void kernel_path_subsurface_accum_indirect( - SubsurfaceIndirectRays *ss_indirect, - PathRadiance *L) -{ - if(ss_indirect->tracing) { - path_radiance_sum_indirect(L); - path_radiance_accum_sample(&ss_indirect->direct_L, L, 1); - if(ss_indirect->num_rays == 0) { - *L = ss_indirect->direct_L; - } - } -} - -ccl_device void kernel_path_subsurface_setup_indirect( - KernelGlobals *kg, - SubsurfaceIndirectRays *ss_indirect, - PathState *state, - Ray *ray, - PathRadiance *L, - float3 *throughput) -{ - if(!ss_indirect->tracing) { - ss_indirect->direct_L = *L; - } - ss_indirect->tracing = true; - - /* Setup state, ray and throughput for indirect SSS rays. 
*/ - ss_indirect->num_rays--; - - Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays]; - PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays]; - - *state = ss_indirect->state[ss_indirect->num_rays]; - *ray = *indirect_ray; - *L = *indirect_L; - *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; - - state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; -} - -#endif /* __SUBSURFACE__ */ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, @@ -631,7 +476,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); + lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d); } if(state.bounce > kernel_data.integrator.ao_bounces) { @@ -776,7 +621,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); + path_radiance_accum_background(&L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -790,6 +635,21 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); +#ifdef __SHADOW_TRICKS__ + if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { + if(state.flag & PATH_RAY_CAMERA) { + state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.catcher_object = sd.object; + if(!kernel_data.background.transparent) { + L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + } + } + } + else { + state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + } +#endif /* __SHADOW_TRICKS__ */ + /* holdout */ #ifdef __HOLDOUT__ if(((sd.flag & SD_HOLDOUT) || @@ -907,7 +767,16 
@@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(state.flag & PATH_RAY_SHADOW_CATCHER) { + L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, &L); + } kernel_write_light_passes(kg, buffer, &L, sample); diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index ff2b828795d..36fd6c95fe7 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -42,21 +42,25 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) + if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + } + else { + path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf); + } } } } @@ -67,8 +71,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { - for(int i = 
0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSDF(sc->type)) continue; @@ -140,14 +144,14 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray *ray, float3 throughput) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSSRDF(sc->type)) continue; /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); + uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); int num_samples = kernel_data.integrator.subsurface_samples; float num_samples_inv = 1.0f/num_samples; RNG bssrdf_rng = cmj_hash(*rng, i); @@ -169,7 +173,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; #endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ @@ -206,7 +210,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, #ifdef __EMISSION__ /* direct light */ if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; + int all = (kernel_data.integrator.sample_all_lights_direct) || + (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light( kg, rng, @@ -280,7 +285,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); + lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d); } bool hit = scene_intersect(kg, ray, 
visibility, &isect, &lcg_state, difl, extmax); @@ -461,7 +466,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); + path_radiance_accum_background(&L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -472,6 +477,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN); shader_merge_closures(&sd); +#ifdef __SHADOW_TRICKS__ + if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { + if(state.flag & PATH_RAY_CAMERA) { + state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.catcher_object = sd.object; + if(!kernel_data.background.transparent) { + L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + } + } + } + else { + state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + } +#endif /* __SHADOW_TRICKS__ */ + /* holdout */ #ifdef __HOLDOUT__ if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) { @@ -544,7 +564,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __EMISSION__ /* direct light */ if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; + int all = (kernel_data.integrator.sample_all_lights_direct) || + (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); } @@ -581,7 +602,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __VOLUME__ */ } - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(state.flag & PATH_RAY_SHADOW_CATCHER) { + L_sum = 
path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, &L); + } kernel_write_light_passes(kg, buffer, &L, sample); diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h index 7b903556bf9..82f83deb595 100644 --- a/intern/cycles/kernel/kernel_path_common.h +++ b/intern/cycles/kernel/kernel_path_common.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "util_hash.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN @@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, - ccl_addr_space RNG *rng, + RNG *rng, ccl_addr_space Ray *ray) { float filter_u; diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 661dc52fb31..c0cd2a63120 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void path_state_init(KernelGlobals *kg, ShaderData *stack_sd, ccl_addr_space PathState *state, - ccl_addr_space RNG *rng, + RNG *rng, int sample, ccl_addr_space Ray *ray) { @@ -54,6 +54,10 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->volume_stack[0].shader = SHADER_NONE; } #endif + +#ifdef __SHADOW_TRICKS__ + state->catcher_object = OBJECT_NONE; +#endif } ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label) diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h new file mode 100644 index 00000000000..10b568ac3dd --- /dev/null +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -0,0 +1,187 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __SUBSURFACE__ +# ifndef __KERNEL_CUDA__ +ccl_device +# else +ccl_device_inline +# endif +bool kernel_path_subsurface_scatter( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + PathRadiance *L, + ccl_addr_space PathState *state, + RNG *rng, + ccl_addr_space Ray *ray, + ccl_addr_space float3 *throughput, + ccl_addr_space SubsurfaceIndirectRays *ss_indirect) +{ + float bssrdf_probability; + ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + + /* modify throughput for picking bssrdf or bsdf */ + *throughput *= bssrdf_probability; + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + /* We should never have two consecutive BSSRDF bounces, + * the second one should be converted to a diffuse BSDF to + * avoid this. 
+ */ + kernel_assert(!ss_indirect->tracing); + + uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb); + + SubsurfaceIntersection ss_isect; + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + int num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); +# ifdef __VOLUME__ + ss_indirect->need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +# endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = 0; hit < num_hits; hit++) { + /* NOTE: We reuse the existing ShaderData, we assume the path + * integration loop stops when this function returns true. + */ + subsurface_scatter_multi_setup(kg, + &ss_isect, + hit, + sd, + state, + state->flag, + sc, + false); + + ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; + ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; + ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; + PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays]; + + *hit_state = *state; + *hit_ray = *ray; + *hit_tp = *throughput; + + hit_state->rng_offset += PRNG_BOUNCE_NUM; + + path_radiance_init(hit_L, kernel_data.film.use_light_pass); + hit_L->direct_throughput = L->direct_throughput; + path_radiance_copy_indirect(hit_L, L); + + kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); + + if(kernel_path_surface_bounce(kg, + rng, + sd, + hit_tp, + hit_state, + hit_L, + hit_ray)) + { +# ifdef __LAMP_MIS__ + hit_state->ray_t = 0.0f; +# endif /* __LAMP_MIS__ */ + +# ifdef __VOLUME__ + if(ss_indirect->need_update_volume_stack) { + Ray volume_ray = *ray; + /* Setup ray from previous surface point to the new one. 
*/ + volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, + &volume_ray.t); + + kernel_volume_stack_update_for_subsurface( + kg, + emission_sd, + &volume_ray, + hit_state->volume_stack); + } +# endif /* __VOLUME__ */ + path_radiance_reset_indirect(L); + ss_indirect->num_rays++; + } + else { + path_radiance_accum_sample(L, hit_L, 1); + } + } + return true; + } + return false; +} + +ccl_device_inline void kernel_path_subsurface_init_indirect( + ccl_addr_space SubsurfaceIndirectRays *ss_indirect) +{ + ss_indirect->tracing = false; + ss_indirect->num_rays = 0; +} + +ccl_device void kernel_path_subsurface_accum_indirect( + ccl_addr_space SubsurfaceIndirectRays *ss_indirect, + PathRadiance *L) +{ + if(ss_indirect->tracing) { + path_radiance_sum_indirect(L); + path_radiance_accum_sample(&ss_indirect->direct_L, L, 1); + if(ss_indirect->num_rays == 0) { + *L = ss_indirect->direct_L; + } + } +} + +ccl_device void kernel_path_subsurface_setup_indirect( + KernelGlobals *kg, + ccl_addr_space SubsurfaceIndirectRays *ss_indirect, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + PathRadiance *L, + ccl_addr_space float3 *throughput) +{ + if(!ss_indirect->tracing) { + ss_indirect->direct_L = *L; + } + ss_indirect->tracing = true; + + /* Setup state, ray and throughput for indirect SSS rays. 
*/ + ss_indirect->num_rays--; + + ccl_addr_space Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays]; + PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays]; + + *state = ss_indirect->state[ss_indirect->num_rays]; + *ray = *indirect_ray; + *L = *indirect_L; + *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; + + state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; +} + +#endif /* __SUBSURFACE__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index fea503d06e5..076c82f3853 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -16,16 +16,22 @@ CCL_NAMESPACE_BEGIN -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) - +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) /* branched path tracing: connect path directly to position on one or more lights and add it to L */ -ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput, - float num_samples_adjust, PathRadiance *L, int sample_all_lights) +ccl_device_noinline void kernel_branched_path_surface_connect_light( + KernelGlobals *kg, + RNG *rng, + ShaderData *sd, + ShaderData *emission_sd, + ccl_addr_space PathState *state, + float3 throughput, + float num_samples_adjust, + PathRadiance *L, + int sample_all_lights) { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) + if(!(sd->flag & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -33,7 +39,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal bool is_lamp; # ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; # endif if(sample_all_lights) { @@ -52,7 +58,7 @@ ccl_device_noinline void 
kernel_branched_path_surface_connect_light(KernelGlobal float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); LightSample ls; - if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) { + if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { /* The sampling probability returned by lamp_light_sample assumes that all lights were sampled. * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */ if(kernel_data.integrator.pdf_triangles != 0.0f) @@ -66,6 +72,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* accumulate */ path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); } + else { + path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + } } } } @@ -87,7 +96,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal light_t = 0.5f*light_t; LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. 
*/ if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; @@ -100,6 +109,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* accumulate */ path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); } + else { + path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + } } } } @@ -113,7 +125,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float terminate = path_state_rng_light_termination(kg, rng, state); LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ @@ -123,6 +135,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* accumulate */ path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); } + else { + path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light); + } } } } @@ -130,9 +145,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal } /* branched path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples, - float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +ccl_device bool kernel_branched_path_surface_bounce( + KernelGlobals *kg, + RNG *rng, + ShaderData *sd, + const ShaderClosure *sc, + int sample, + int num_samples, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + Ray *ray) { /* sample BSDF */ float bsdf_pdf; @@ 
-156,15 +179,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = ccl_fetch(sd, time); + ray->time = sd->time; #endif #ifdef __VOLUME__ @@ -188,15 +211,29 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, #endif -#ifndef __SPLIT_KERNEL__ /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng, +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) + return; + +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + kernel_branched_path_surface_connect_light(kg, + rng, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + 1); return; + } +#endif /* sample illumination from lights to find path contribution */ float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); @@ -208,11 +245,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), 
state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { float terminate = path_state_rng_light_termination(kg, rng, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ @@ -222,15 +259,17 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ /* accumulate */ path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); } + else { + path_radiance_accum_total_light(L, throughput, &L_light); + } } } #endif } -#endif /* path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - ccl_addr_space RNG *rng, + RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, @@ -238,7 +277,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(ccl_fetch(sd, flag) & SD_BSDF) { + if(sd->flag & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; @@ -270,16 +309,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? 
-sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif @@ -291,21 +330,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return true; } #ifdef __VOLUME__ - else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { + else if(sd->flag & SD_HAS_ONLY_VOLUME) { /* no surface shader but have a volume shader? act transparent */ /* update path state, count as transparent */ path_state_next(kg, state, LABEL_TRANSPARENT); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, -sd->Ng); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 3d3b7385d8b..371f2c1c7cb 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -24,7 +24,7 @@ ccl_device_inline void kernel_path_volume_connect_light( ShaderData *sd, ShaderData *emission_sd, float3 throughput, - PathState *state, + ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ @@ -59,7 +59,7 @@ ccl_device_inline void kernel_path_volume_connect_light( } } } -#endif +#endif /* __EMISSION__ */ } #ifdef __KERNEL_GPU__ @@ -67,8 +67,14 @@ ccl_device_noinline #else ccl_device #endif -bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray 
*ray) +bool kernel_path_volume_bounce( + KernelGlobals *kg, + RNG *rng, + ShaderData *sd, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + ccl_addr_space Ray *ray) { /* sample phase function */ float phase_pdf; @@ -111,9 +117,18 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, return true; } -ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L, - bool sample_all_lights, Ray *ray, const VolumeSegment *segment) +#ifndef __SPLIT_KERNEL__ +ccl_device void kernel_branched_path_volume_connect_light( + KernelGlobals *kg, + RNG *rng, + ShaderData *sd, + ShaderData *emission_sd, + float3 throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + bool sample_all_lights, + Ray *ray, + const VolumeSegment *segment) { #ifdef __EMISSION__ if(!kernel_data.integrator.use_direct_light) @@ -261,10 +276,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG } } } -#endif +#endif /* __EMISSION__ */ } +#endif /* __SPLIT_KERNEL__ */ -#endif +#endif /* __VOLUME_SCATTER__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index cf5614b8a86..96bc636d5ac 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -17,12 +17,15 @@ #ifndef __KERNEL_QUEUE_H__ #define __KERNEL_QUEUE_H__ +CCL_NAMESPACE_BEGIN + /* * Queue utility functions for split kernel */ - +#ifdef __KERNEL_OPENCL__ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#endif /* * Enqueue ray index into the queue @@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index( ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ { /* This thread's queue index. 
*/ - int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number]) + + (queue_number * queue_size); queues[my_queue_index] = ray_index; } @@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index( * is no more ray to allocate to other threads. */ ccl_device int get_ray_index( + KernelGlobals *kg, int thread_index, /* Global thread index. */ int queue_number, /* Queue to operate on. */ ccl_global int *queues, /* Buffer of all queues. */ @@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local( int queue_number, /* Queue in which to enqueue ray index. */ char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ int queuesize, /* queue size. */ - ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics. */ + ccl_local_param unsigned int *local_queue_atomics, /* To to local queue atomics. */ ccl_global int *Queue_data, /* Queues. */ ccl_global int *Queue_index) /* To do global queue atomics. */ { - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); /* Get local queue id .*/ unsigned int lqidx; if(enqueue_flag) { - lqidx = atomic_inc(local_queue_atomics); + lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue offset. */ if(lidx == 0) { - *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + *local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number], + *local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue index and enqueue ray. 
*/ if(enqueue_flag) { @@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local( ccl_device unsigned int get_local_queue_index( int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local unsigned int *local_queue_atomics) + ccl_local_param unsigned int *local_queue_atomics) { - int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); return my_lqidx; } ccl_device unsigned int get_global_per_queue_offset( int queue_number, - ccl_local unsigned int *local_queue_atomics, + ccl_local_param unsigned int *local_queue_atomics, ccl_global int* global_queue_atomics) { - unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number], - local_queue_atomics[queue_number]); + unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number], + local_queue_atomics[queue_number]); return queue_offset; } @@ -116,10 +122,12 @@ ccl_device unsigned int get_global_queue_index( int queue_number, int queuesize, unsigned int lqidx, - ccl_local unsigned int * global_per_queue_offset) + ccl_local_param unsigned int * global_per_queue_offset) { int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; return my_gqidx; } +CCL_NAMESPACE_END + #endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index e773753396f..d4f0caff5de 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "kernel_jitter.h" +#include "kernel/kernel_jitter.h" CCL_NAMESPACE_BEGIN @@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -130,7 +130,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG * #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -147,7 +147,7 @@ ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *r } } -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) { #ifdef __SOBOL_FULL_SCREEN__ uint px, py; @@ -191,14 +191,14 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG /* Linear Congruential Generator */ -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) { /* implicit mod 2^32 */ - rng = (1103515245*(rng) + 12345); - return (float)rng * (1.0f/(float)0xFFFFFFFF); + *rng = 
(1103515245*(*rng) + 12345); + return (float)*rng * (1.0f/(float)0xFFFFFFFF); } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); @@ -259,12 +259,12 @@ ccl_device uint lcg_init(uint seed) * For branches in the path we must be careful not to reuse the same number * in a sequence and offset accordingly. */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) { return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) { /* the rng_offset is not increased for transparent bounces. 
if we do then * fully transparent objects can become subtly visible by the different @@ -277,29 +277,29 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_ad return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) { int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int 
num_branches, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); } /* Utitility functions to get light termination value, since it might not be needed in many cases. */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state) +ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); @@ -307,7 +307,7 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_ return 0.0f; } -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches) +ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE); @@ -315,7 +315,7 @@ ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, c return 0.0f; } -ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches) +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches) { /* path is splitting into a branch, adjust so that each branch * still gets a unique sample from the same sequence */ @@ -324,18 +324,9 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) -{ - return lcg_init(*rng + 
state->rng_offset + state->sample*scramble); -} - -/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */ - -ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng, - const ccl_addr_space PathState *state, - uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble) { - return lcg_init(*rng + state->rng_offset + state->sample*scramble); + return lcg_init(*rng + rng_offset + sample*scramble); } ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index d0826e5e879..8c0c5e90a3e 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -24,12 +24,12 @@ * */ -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include "closure/bsdf.h" -#include "closure/emissive.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf.h" +#include "kernel/closure/emissive.h" -#include "svm/svm.h" +#include "kernel/svm/svm.h" CCL_NAMESPACE_BEGIN @@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN #ifdef __OBJECT_MOTION__ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) { - if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) { - ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time); - ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); + if(sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); } else { - ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); - ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + sd->ob_itfm = object_fetch_transform(kg, 
sd->object, OBJECT_INVERSE_TRANSFORM); } } #endif @@ -55,55 +55,55 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Ray *ray) { #ifdef __INSTANCING__ - ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif - ccl_fetch(sd, type) = isect->type; - ccl_fetch(sd, flag) = 0; - ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->type = isect->type; + sd->flag = 0; + sd->object_flag = kernel_tex_fetch(__object_flag, + sd->object); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - ccl_fetch(sd, time) = ray->time; + sd->time = ray->time; #endif - ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); - ccl_fetch(sd, ray_length) = isect->t; + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); + sd->ray_length = isect->t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = isect->v; + sd->u = isect->u; + sd->v = isect->v; #endif #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - ccl_fetch(sd, shader) = __float_as_int(curvedata.z); - ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); + sd->shader = __float_as_int(curvedata.z); + sd->P = bvh_curve_refine(kg, sd, isect, ray); } else #endif - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ - ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); - ccl_fetch(sd, Ng) = Ng; 
- ccl_fetch(sd, N) = Ng; + sd->P = triangle_refine(kg, sd, isect, ray); + sd->Ng = Ng; + sd->N = Ng; /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); #endif } else { @@ -111,40 +111,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, motion_triangle_shader_setup(kg, sd, isect, ray, false); } - ccl_fetch(sd, I) = -ray->D; + sd->I = -ray->D; - ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } #endif /* backfacing test */ - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv 
= -sd->dPdv; #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); - differential_incoming(&ccl_fetch(sd, dI), ray->dD); - differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); + differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); + differential_incoming(&sd->dI, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif } @@ -203,11 +203,11 @@ void shader_setup_from_subsurface( # ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform(kg, sd, &sd->N); - object_normal_transform(kg, sd, &sd->Ng); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } # endif @@ -249,106 +249,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, int lamp) { /* vectors */ - ccl_fetch(sd, P) = P; - ccl_fetch(sd, N) = Ng; - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, I) = I; - ccl_fetch(sd, shader) = shader; + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; if(prim != PRIM_NONE) - ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE; + sd->type = PRIMITIVE_TRIANGLE; else if(lamp != LAMP_NONE) - ccl_fetch(sd, type) = PRIMITIVE_LAMP; + sd->type = PRIMITIVE_LAMP; else - ccl_fetch(sd, type) = PRIMITIVE_NONE; + sd->type = PRIMITIVE_NONE; /* primitive */ #ifdef __INSTANCING__ - ccl_fetch(sd, object) = object; + sd->object = object; #endif /* currently no access to bvh prim index for strand sd->prim*/ - ccl_fetch(sd, prim) = prim; + sd->prim = prim; #ifdef __UV__ - ccl_fetch(sd, u) = u; - ccl_fetch(sd, v) = v; + sd->u = u; + 
sd->v = v; #endif - ccl_fetch(sd, ray_length) = t; + sd->ray_length = t; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; - if(ccl_fetch(sd, object) != OBJECT_NONE) { - ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; + if(sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, + sd->object); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); - ccl_fetch(sd, time) = time; + sd->time = time; } else if(lamp != LAMP_NONE) { - ccl_fetch(sd, ob_tfm) = lamp_fetch_transform(kg, lamp, false); - ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true); + sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); + sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); #endif } /* transform into world space */ if(object_space) { - object_position_transform_auto(kg, sd, &ccl_fetch(sd, P)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I)); + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); } - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + 
object_normal_transform_auto(kg, sd, &sd->N); } #endif } /* dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); # ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); } # endif #endif } else { #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(ccl_fetch(sd, prim) != PRIM_NONE) { - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + if(sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - ccl_fetch(sd, dP) = differential3_zero(); - ccl_fetch(sd, dI) = differential3_zero(); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -378,39 +378,39 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const 
Ray *ray) { /* vectors */ - ccl_fetch(sd, P) = ray->D; - ccl_fetch(sd, N) = -ray->D; - ccl_fetch(sd, Ng) = -ray->D; - ccl_fetch(sd, I) = -ray->D; - ccl_fetch(sd, shader) = kernel_data.background.surface_shader; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; + sd->P = ray->D; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; #ifdef __OBJECT_MOTION__ - ccl_fetch(sd, time) = ray->time; + sd->time = ray->time; #endif - ccl_fetch(sd, ray_length) = 0.0f; + sd->ray_length = 0.0f; #ifdef __INSTANCING__ - ccl_fetch(sd, object) = PRIM_NONE; + sd->object = PRIM_NONE; #endif - ccl_fetch(sd, prim) = PRIM_NONE; + sd->prim = PRIM_NONE; #ifdef __UV__ - ccl_fetch(sd, u) = 0.0f; - ccl_fetch(sd, v) = 0.0f; + sd->u = 0.0f; + sd->v = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - ccl_fetch(sd, dP) = ray->dD; - differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = ray->dD; + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -505,18 +505,18 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + for(int i = 0; i < sd->num_closure; i++) { if(i == skip_bsdf) continue; - const ShaderClosure *sc = 
ccl_fetch_array(sd, closure, i); + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); if(bsdf_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval*sc->weight); + bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f); sum_pdf += bsdf_pdf*sc->sample_weight; } @@ -535,8 +535,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float light_pdf, bool use_mis) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -544,7 +544,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f; bsdf_eval_accum(result_eval, sc->type, - eval * sc->weight * mis_weight); + eval * sc->weight, + mis_weight); } } } @@ -576,7 +577,7 @@ void shader_bsdf_eval(KernelGlobals *kg, _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f); if(use_mis) { float weight = power_heuristic(light_pdf, pdf); - bsdf_eval_mul(eval, weight); + bsdf_eval_mis(eval, weight); } } } @@ -591,22 +592,22 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, { int sampled = 0; - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { /* pick a BSDF closure based on sample weights */ float sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + for(sampled = 0; sampled < sd->num_closure; sampled++) { + const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_BSDF(sc->type)) sum += sc->sample_weight; } - float r = ccl_fetch(sd, randb_closure)*sum; + float r = sd->randb_closure*sum; sum = 
0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + for(sampled = 0; sampled < sd->num_closure; sampled++) { + const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; @@ -616,13 +617,13 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, } } - if(sampled == ccl_fetch(sd, num_closure)) { + if(sampled == sd->num_closure) { *pdf = 0.0f; return LABEL_NONE; } } - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + const ShaderClosure *sc = &sd->closure[sampled]; int label; float3 eval; @@ -633,7 +634,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { float sweight = sc->sample_weight; _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); } @@ -660,8 +661,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); @@ -670,13 +671,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { - if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) + if(sd->flag & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure 
*sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -685,6 +686,18 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) +{ + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) { + sc->sample_weight = 0.0f; + sc->weight = make_float3(0.0f, 0.0f, 0.0f); + } + } +} + ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) { float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd); @@ -699,8 +712,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -713,8 +726,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -727,8 +740,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -741,8 +754,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = 
make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -756,8 +769,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc; @@ -766,12 +779,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += ccl_fetch(sd, N)*average(sc->weight); + N += sd->N*average(sc->weight); } } if(is_zero(N)) - N = ccl_fetch(sd, N); + N = sd->N; else N = normalize(N); @@ -786,8 +799,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const Bssrdf*)sc; @@ -801,10 +814,10 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b } if(N_) - *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); + *N_ = (is_zero(N))? 
sd->N: normalize(N); if(texture_blur_) - *texture_blur_ = texture_blur/weight_sum; + *texture_blur_ = safe_divide(texture_blur, weight_sum); return eval; } @@ -814,7 +827,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) { - return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); + return emissive_simple_eval(sd->Ng, sd->I); } ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) @@ -822,8 +835,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) float3 eval; eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_EMISSION(sc->type)) eval += emissive_eval(kg, sd, sc)*sc->weight; @@ -838,8 +851,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -850,12 +863,12 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng, +ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng, ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = randb; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = randb; #ifdef __OSL__ if(kg->osl) @@ -869,13 +882,13 @@ ccl_device void 
shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f)); - bsdf->N = ccl_fetch(sd, N); - ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf); + bsdf->N = sd->N; + sd->flag |= bsdf_diffuse_setup(bsdf); #endif } - if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) { - ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953); + if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) { + sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953); } } @@ -884,9 +897,9 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = 0.0f; #ifdef __SVM__ #ifdef __OSL__ @@ -901,8 +914,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BACKGROUND(sc->type)) eval += sc->weight; @@ -932,7 +945,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf); if(phase_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval); + bsdf_eval_accum(result_eval, sc->type, eval, 1.0f); sum_pdf += phase_pdf*sc->sample_weight; } @@ -1024,8 +1037,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, - PathState *state, - 
VolumeStack *stack, + ccl_addr_space PathState *state, + ccl_addr_space VolumeStack *stack, int path_flag, ShaderContext ctx) { @@ -1081,9 +1094,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 2981f6ac566..0426e0a62c9 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -16,9 +16,84 @@ CCL_NAMESPACE_BEGIN -#ifdef __SHADOW_RECORD_ALL__ +/* Attenuate throughput accordingly to the given intersection event. + * Returns true if the throughput is zero and traversal can be aborted. + */ +ccl_device_forceinline bool shadow_handle_transparent_isect( + KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, +# ifdef __VOLUME__ + ccl_addr_space struct PathState *volume_state, +# endif + Intersection *isect, + Ray *ray, + float3 *throughput) +{ +#ifdef __VOLUME__ + /* Attenuation between last surface and next surface. */ + if(volume_state->volume_stack[0].shader != SHADER_NONE) { + Ray segment_ray = *ray; + segment_ray.t = isect->t; + kernel_volume_shadow(kg, + shadow_sd, + volume_state, + &segment_ray, + throughput); + } +#endif + /* Setup shader data at surface. */ + shader_setup_from_ray(kg, shadow_sd, isect, ray); + /* Attenuation from transparent surface. 
*/ + if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { + path_state_modify_bounce(state, true); + shader_eval_surface(kg, + shadow_sd, + NULL, + state, + 0.0f, + PATH_RAY_SHADOW, + SHADER_CONTEXT_SHADOW); + path_state_modify_bounce(state, false); + *throughput *= shader_bsdf_transparency(kg, shadow_sd); + } + /* Stop if all light is blocked. */ + if(is_zero(*throughput)) { + return true; + } +#ifdef __VOLUME__ + /* Exit/enter volume. */ + kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack); +#endif + return false; +} + +/* Special version which only handles opaque shadows. */ +ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray, + Intersection *isect, + float3 *shadow) +{ + const bool blocked = scene_intersect(kg, + *ray, + PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); +#ifdef __VOLUME__ + if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { + /* Apply attenuation from current volume shader. */ + kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + } +#endif + return blocked; +} -/* Shadow function to compute how much light is blocked, CPU variation. +#ifdef __TRANSPARENT_SHADOWS__ +# ifdef __SHADOW_RECORD_ALL__ +/* Shadow function to compute how much light is blocked, * * We trace a single ray. If it hits any opaque surface, or more than a given * number of transparent surfaces is hit, then we consider the geometry to be @@ -36,261 +111,403 @@ CCL_NAMESPACE_BEGIN * or there is a performance increase anyway due to avoiding the need to send * two rays with transparent shadows. * - * This is CPU only because of qsort, and malloc or high stack space usage to - * record all these intersections. */ + * On CPU it'll handle all transparent bounces (by allocating storage for + * intersections when they don't fit into the stack storage). + * + * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this + * is something to be kept an eye on. 
+ */ -#define STACK_MAX_HITS 64 +# define SHADOW_STACK_MAX_HITS 64 -ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow) +/* Actual logic with traversal loop implementation which is free from device + * specific tweaks. + * + * Note that hits array should be as big as max_hits+1. + */ +ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const int skip_object, + Ray *ray, + Intersection *hits, + uint max_hits, + float3 *shadow) { - *shadow = make_float3(1.0f, 1.0f, 1.0f); - - if(ray->t == 0.0f) - return false; - - bool blocked; - - if(kernel_data.integrator.transparent_shadows) { - /* check transparent bounces here, for volume scatter which can do - * lighting before surface path termination is checked */ - if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) - return true; - - /* intersect to find an opaque surface, or record all transparent surface hits */ - Intersection hits_stack[STACK_MAX_HITS]; - Intersection *hits = hits_stack; - const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; - uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; - - /* prefer to use stack but use dynamic allocation if too deep max hits - * we need max_hits + 1 storage space due to the logic in - * scene_intersect_shadow_all which will first store and then check if - * the limit is exceeded */ - if(max_hits + 1 > STACK_MAX_HITS) { - if(kg->transparent_shadow_intersections == NULL) { - kg->transparent_shadow_intersections = - (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1)); + /* Intersect to find an opaque surface, or record all transparent + * surface hits. 
+ */ + uint num_hits; + const bool blocked = scene_intersect_shadow_all(kg, + ray, + hits, + skip_object, + max_hits, + &num_hits); + /* If no opaque surface found but we did find transparent hits, + * shade them. + */ + if(!blocked && num_hits > 0) { + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + float3 Pend = ray->P + ray->D*ray->t; + float last_t = 0.0f; + int bounce = state->transparent_bounce; + Intersection *isect = hits; +# ifdef __VOLUME__ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; +# else + PathState ps_object; + PathState *ps = &ps_object; +# endif + *ps = *state; +# endif + sort_intersections(hits, num_hits); + for(int hit = 0; hit < num_hits; hit++, isect++) { + /* Adjust intersection distance for moving ray forward. */ + float new_t = isect->t; + isect->t -= last_t; + /* Skip hit if we did not move forward, step by step raytracing + * would have skipped it as well then. + */ + if(last_t == new_t) { + continue; } - hits = kg->transparent_shadow_intersections; - } - - uint num_hits; - blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits); - - /* if no opaque surface found but we did find transparent hits, shade them */ - if(!blocked && num_hits > 0) { - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float3 Pend = ray->P + ray->D*ray->t; - float last_t = 0.0f; - int bounce = state->transparent_bounce; - Intersection *isect = hits; + last_t = new_t; + /* Attenuate the throughput. 
*/ + if(shadow_handle_transparent_isect(kg, + shadow_sd, + state, #ifdef __VOLUME__ - PathState ps = *state; + ps, #endif - - qsort(hits, num_hits, sizeof(Intersection), intersections_compare); - - for(int hit = 0; hit < num_hits; hit++, isect++) { - /* adjust intersection distance for moving ray forward */ - float new_t = isect->t; - isect->t -= last_t; - - /* skip hit if we did not move forward, step by step raytracing - * would have skipped it as well then */ - if(last_t == new_t) - continue; - - last_t = new_t; - -#ifdef __VOLUME__ - /* attenuation between last surface and next surface */ - if(ps.volume_stack[0].shader != SHADER_NONE) { - Ray segment_ray = *ray; - segment_ray.t = isect->t; - kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput); - } -#endif - - /* setup shader data at surface */ - shader_setup_from_ray(kg, shadow_sd, isect, ray); - - /* attenuation from transparent surface */ - if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { - path_state_modify_bounce(state, true); - shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); - path_state_modify_bounce(state, false); - - throughput *= shader_bsdf_transparency(kg, shadow_sd); - } - - /* stop if all light is blocked */ - if(is_zero(throughput)) { - return true; - } - - /* move ray forward */ - ray->P = shadow_sd->P; - if(ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - -#ifdef __VOLUME__ - /* exit/enter volume */ - kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack); -#endif - - bounce++; + isect, + ray, + &throughput)) + { + return true; } - -#ifdef __VOLUME__ - /* attenuation for last line segment towards light */ - if(ps.volume_stack[0].shader != SHADER_NONE) - kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput); -#endif - - *shadow = throughput; - - return is_zero(throughput); + /* Move ray forward. 
*/ + ray->P = shadow_sd->P; + if(ray->t != FLT_MAX) { + ray->D = normalize_len(Pend - ray->P, &ray->t); + } + bounce++; } +# ifdef __VOLUME__ + /* Attenuation for last line segment towards light. */ + if(ps->volume_stack[0].shader != SHADER_NONE) { + kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); + } +# endif + *shadow = throughput; + return is_zero(throughput); } - else { - Intersection isect; - blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); - } - -#ifdef __VOLUME__ +# ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* apply attenuation from current volume shader */ + /* Apply attenuation from current volume shader/ */ kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); } -#endif - +# endif return blocked; } -#undef STACK_MAX_HITS - -#else +/* Here we do all device specific trickery before invoking actual traversal + * loop to help readability of the actual logic. + */ +ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const int skip_object, + Ray *ray, + uint max_hits, + float3 *shadow) +{ +# ifdef __SPLIT_KERNEL__ + Intersection hits_[SHADOW_STACK_MAX_HITS]; + Intersection *hits = &hits_[0]; +# elif defined(__KERNEL_CUDA__) + Intersection *hits = kg->hits_stack; +# else + Intersection hits_stack[SHADOW_STACK_MAX_HITS]; + Intersection *hits = hits_stack; +# endif +# ifndef __KERNEL_GPU__ + /* Prefer to use stack but use dynamic allocation if too deep max hits + * we need max_hits + 1 storage space due to the logic in + * scene_intersect_shadow_all which will first store and then check if + * the limit is exceeded. + * + * Ignore this on GPU because of slow/unavailable malloc(). 
+ */ + if(max_hits + 1 > SHADOW_STACK_MAX_HITS) { + if(kg->transparent_shadow_intersections == NULL) { + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + kg->transparent_shadow_intersections = + (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1)); + } + hits = kg->transparent_shadow_intersections; + } +# endif /* __KERNEL_GPU__ */ + /* Invoke actual traversal. */ + return shadow_blocked_transparent_all_loop(kg, + shadow_sd, + state, + skip_object, + ray, + hits, + max_hits, + shadow); +} +# endif /* __SHADOW_RECORD_ALL__ */ -/* Shadow function to compute how much light is blocked, GPU variation. +# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) +/* Shadow function to compute how much light is blocked, * * Here we raytrace from one transparent surface to the next step by step. * To minimize overhead in cases where we don't need transparent shadows, we * first trace a regular shadow ray. We check if the hit primitive was * potentially transparent, and only in that case start marching. this gives - * one extra ray cast for the cases were we do want transparency. */ + * one extra ray cast for the cases were we do want transparency. + */ -ccl_device_noinline bool shadow_blocked(KernelGlobals *kg, - ShaderData *shadow_sd, - ccl_addr_space PathState *state, - ccl_addr_space Ray *ray_input, - float3 *shadow) +/* This function is only implementing device-independent traversal logic + * which requires some precalculation done. 
+ */ +ccl_device bool shadow_blocked_transparent_stepped_loop( + KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const int skip_object, + Ray *ray, + Intersection *isect, + const bool blocked, + const bool is_transparent_isect, + float3 *shadow) { - *shadow = make_float3(1.0f, 1.0f, 1.0f); - - if(ray_input->t == 0.0f) - return false; - -#ifdef __SPLIT_KERNEL__ - Ray private_ray = *ray_input; - Ray *ray = &private_ray; -#else - Ray *ray = ray_input; -#endif - -#ifdef __SPLIT_KERNEL__ - Intersection *isect = &kg->isect_shadow[SD_THREAD]; -#else - Intersection isect_object; - Intersection *isect = &isect_object; -#endif - - bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); - -#ifdef __TRANSPARENT_SHADOWS__ - if(blocked && kernel_data.integrator.transparent_shadows) { - if(shader_transparent_shadow(kg, isect)) { - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float3 Pend = ray->P + ray->D*ray->t; - int bounce = state->transparent_bounce; -#ifdef __VOLUME__ - PathState ps = *state; -#endif - - for(;;) { - if(bounce >= kernel_data.integrator.transparent_max_bounce) - return true; - - if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) - { -#ifdef __VOLUME__ - /* attenuation for last line segment towards light */ - if(ps.volume_stack[0].shader != SHADER_NONE) - kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput); -#endif - - *shadow *= throughput; - - return false; - } - - if(!shader_transparent_shadow(kg, isect)) { - return true; - } - -#ifdef __VOLUME__ - /* attenuation between last surface and next surface */ - if(ps.volume_stack[0].shader != SHADER_NONE) { - Ray segment_ray = *ray; - segment_ray.t = isect->t; - kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput); + if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) { + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + float3 Pend = ray->P + ray->D*ray->t; + int 
bounce = state->transparent_bounce; +# ifdef __VOLUME__ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; +# else + PathState ps_object; + PathState *ps = &ps_object; +# endif + *ps = *state; +# endif + for(;;) { + if(bounce >= kernel_data.integrator.transparent_max_bounce) { + return true; + } + if(!scene_intersect(kg, + *ray, + PATH_RAY_SHADOW_TRANSPARENT, + isect, + NULL, + 0.0f, 0.0f)) + { + break; + } +#ifdef __SHADOW_TRICKS__ + if(skip_object != OBJECT_NONE) { + const int isect_object = (isect->object == PRIM_NONE) + ? kernel_tex_fetch(__prim_object, isect->prim) + : isect->object; + if(isect_object == skip_object) { + shader_setup_from_ray(kg, shadow_sd, isect, ray); + /* Move ray forward. */ + ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); + if(ray->t != FLT_MAX) { + ray->D = normalize_len(Pend - ray->P, &ray->t); + } + bounce++; + continue; } + } #endif - - /* setup shader data at surface */ - shader_setup_from_ray(kg, shadow_sd, isect, ray); - - /* attenuation from transparent surface */ - if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) { - path_state_modify_bounce(state, true); - shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); - path_state_modify_bounce(state, false); - - throughput *= shader_bsdf_transparency(kg, shadow_sd); - } - - /* stop if all light is blocked */ - if(is_zero(throughput)) { - return true; - } - - /* move ray forward */ - ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng)); - if(ray->t != FLT_MAX) { - ray->D = normalize_len(Pend - ray->P, &ray->t); - } - + if(!shader_transparent_shadow(kg, isect)) { + return true; + } + /* Attenuate the throughput. 
*/ + if(shadow_handle_transparent_isect(kg, + shadow_sd, + state, #ifdef __VOLUME__ - /* exit/enter volume */ - kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack); + ps, #endif - - bounce++; + isect, + ray, + &throughput)) + { + return true; } + /* Move ray forward. */ + ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); + if(ray->t != FLT_MAX) { + ray->D = normalize_len(Pend - ray->P, &ray->t); + } + bounce++; + } +# ifdef __VOLUME__ + /* Attenuation for last line segment towards light. */ + if(ps->volume_stack[0].shader != SHADER_NONE) { + kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); } +# endif + *shadow *= throughput; + return is_zero(throughput); } -#ifdef __VOLUME__ - else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* apply attenuation from current volume shader */ +# ifdef __VOLUME__ + if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { + /* Apply attenuation from current volume shader. */ kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); } -#endif -#endif - +# endif return blocked; } +ccl_device bool shadow_blocked_transparent_stepped( + KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + const int skip_object, + Ray *ray, + Intersection *isect, + float3 *shadow) +{ + bool blocked, is_transparent_isect; + if (skip_object == OBJECT_NONE) { + blocked = scene_intersect(kg, + *ray, + PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); + is_transparent_isect = blocked + ? 
shader_transparent_shadow(kg, isect) + : false; + } + else { + blocked = false; + is_transparent_isect = false; + } + return shadow_blocked_transparent_stepped_loop(kg, + shadow_sd, + state, + skip_object, + ray, + isect, + blocked, + is_transparent_isect, + shadow); +} + +# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ +#endif /* __TRANSPARENT_SHADOWS__ */ + +ccl_device_inline bool shadow_blocked(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray_input, + float3 *shadow) +{ + Ray *ray = ray_input; + Intersection isect; + /* Some common early checks. */ + *shadow = make_float3(1.0f, 1.0f, 1.0f); + if(ray->t == 0.0f) { + return false; + } +#ifdef __SHADOW_TRICKS__ + const int skip_object = state->catcher_object; +#else + const int skip_object = OBJECT_NONE; #endif + /* Do actual shadow shading. */ + /* First of all, we check if integrator requires transparent shadows. + * if not, we use simplest and fastest ever way to calculate occlusion. + * + * NOTE: We can't do quick opaque test here if we are on shadow-catcher + * path because we don't want catcher object to be casting shadow here. + */ +#ifdef __TRANSPARENT_SHADOWS__ + if(!kernel_data.integrator.transparent_shadows && + skip_object == OBJECT_NONE) +#endif + { + return shadow_blocked_opaque(kg, + shadow_sd, + state, + ray, + &isect, + shadow); + } +#ifdef __TRANSPARENT_SHADOWS__ +# ifdef __SHADOW_RECORD_ALL__ + /* For the transparent shadows we try to use record-all logic on the + * devices which supports this. + */ + const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; + /* Check transparent bounces here, for volume scatter which can do + * lighting before surface path termination is checked. 
+ */ + if(state->transparent_bounce >= transparent_max_bounce) { + return true; + } + const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; +# ifdef __KERNEL_GPU__ + /* On GPU we do trickey with tracing opaque ray first, this avoids speed + * regressions in some files. + * + * TODO(sergey): Check why using record-all behavior causes slowdown in such + * cases. Could that be caused by a higher spill pressure? + */ + const bool blocked = scene_intersect(kg, + *ray, + PATH_RAY_SHADOW_OPAQUE, + &isect, + NULL, + 0.0f, 0.0f); + const bool is_transparent_isect = blocked + ? shader_transparent_shadow(kg, &isect) + : false; + if(!blocked || !is_transparent_isect || + max_hits + 1 >= SHADOW_STACK_MAX_HITS) + { + return shadow_blocked_transparent_stepped_loop(kg, + shadow_sd, + state, + skip_object, + ray, + &isect, + blocked, + is_transparent_isect, + shadow); + } +# endif /* __KERNEL_GPU__ */ + return shadow_blocked_transparent_all(kg, + shadow_sd, + state, + skip_object, + ray, + max_hits, + shadow); +# else /* __SHADOW_RECORD_ALL__ */ + /* Fallback to a slowest version which works on all devices. 
*/ + return shadow_blocked_transparent_stepped(kg, + shadow_sd, + state, + skip_object, + ray, + &isect, + shadow); +# endif /* __SHADOW_RECORD_ALL__ */ +#endif /* __TRANSPARENT_SHADOWS__ */ +} -CCL_NAMESPACE_END +#undef SHADOW_STACK_MAX_HITS +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index c5652ebf7dc..6c8b7cca4ce 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -201,7 +201,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent) ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, int state_flag, float3 *eval, float3 *N) @@ -239,7 +239,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( SubsurfaceIntersection *ss_isect, ShaderData *sd, ShaderClosure *sc, - uint *lcg_state, + RNG *lcg_state, float disk_u, float disk_v, bool all) @@ -293,7 +293,12 @@ ccl_device_inline int subsurface_scatter_multi_intersect( float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B; /* create ray */ +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else Ray *ray = &ss_isect->ray; +#endif ray->P = sd->P + disk_N*disk_height + disk_P; ray->D = -disk_N; ray->t = 2.0f*disk_height; @@ -304,7 +309,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( /* intersect with the same object. if multiple intersections are found it * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ scene_intersect_subsurface(kg, - ray, + *ray, ss_isect, sd->object, lcg_state, @@ -314,20 +319,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( for(int hit = 0; hit < num_eval_hits; hit++) { /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ float3 hit_P; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { hit_P = triangle_refine_subsurface(kg, sd, &ss_isect->hits[hit], ray); } #ifdef __OBJECT_MOTION__ - else if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) { + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, - ccl_fetch(sd, object), + sd->object, kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - ccl_fetch(sd, time), + sd->time, verts); hit_P = motion_triangle_refine_subsurface(kg, sd, @@ -367,6 +372,10 @@ ccl_device_inline int subsurface_scatter_multi_intersect( ss_isect->weight[hit] = eval; } +#ifdef __SPLIT_KERNEL__ + ss_isect->ray = *ray; +#endif + return num_eval_hits; } @@ -375,13 +384,19 @@ ccl_device_noinline void subsurface_scatter_multi_setup( SubsurfaceIntersection* ss_isect, int hit, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, int state_flag, ShaderClosure *sc, bool all) { +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else + Ray *ray = &ss_isect->ray; +#endif /* Setup new shading point. */ - shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray); + shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); /* Optionally blur colors and bump mapping. */ float3 weight = ss_isect->weight[hit]; @@ -392,6 +407,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup( subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N); } +#ifndef __SPLIT_KERNEL__ /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) @@ -448,7 +464,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS /* intersect with the same object. 
if multiple intersections are * found it will randomly pick one of them */ SubsurfaceIntersection ss_isect; - scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1); + scene_intersect_subsurface(kg, ray, &ss_isect, sd->object, lcg_state, 1); /* evaluate bssrdf */ if(ss_isect.num_hits > 0) { @@ -481,6 +497,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS /* setup diffuse bsdf */ subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N); } +#endif /* ! __SPLIT_KERNEL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 8d5bb75a428..cb1a3f40dee 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -32,6 +32,7 @@ KERNEL_TEX(uint, texture_uint, __prim_visibility) KERNEL_TEX(uint, texture_uint, __prim_index) KERNEL_TEX(uint, texture_uint, __prim_object) KERNEL_TEX(uint, texture_uint, __object_node) +KERNEL_TEX(float2, texture_float2, __prim_time) /* objects */ KERNEL_TEX(float4, texture_float4, __objects) @@ -177,7 +178,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) # else /* bindless textures */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 8c271c75e44..19c91248922 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -17,9 +17,9 @@ #ifndef __KERNEL_TYPES_H__ #define __KERNEL_TYPES_H__ -#include "kernel_math.h" -#include "svm/svm_types.h" -#include "util_static_assert.h" +#include "kernel/kernel_math.h" +#include "kernel/svm/svm_types.h" +#include "util/util_static_assert.h" #ifndef 
__KERNEL_GPU__ # define __KERNEL_CPU__ @@ -56,6 +56,8 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 16 +#define WORK_POOL_SIZE 64 + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -63,27 +65,34 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# endif # ifdef WITH_OSL # define __OSL__ # endif # define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ -# define __VOLUME_DECOUPLED__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ -# define __VOLUME_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __VOLUME_DECOUPLED__ +# define __VOLUME_RECORD_ALL__ +# endif #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SUBSURFACE__ -# define __CMJ__ +# define __SHADOW_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# define __CMJ__ +# endif #endif /* __KERNEL_CUDA__ */ #ifdef __KERNEL_OPENCL__ @@ -93,6 +102,10 @@ CCL_NAMESPACE_BEGIN # ifdef __KERNEL_OPENCL_NVIDIA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __SUBSURFACE__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ # ifdef __KERNEL_EXPERIMENTAL__ # define __CMJ__ # endif @@ -114,6 +127,10 @@ CCL_NAMESPACE_BEGIN # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __SUBSURFACE__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ @@ -140,6 +157,7 @@ CCL_NAMESPACE_BEGIN #define __INTERSECTION_REFINE__ #define __CLAMP_SAMPLE__ #define __PATCH_EVAL__ +#define __SHADOW_TRICKS__ #ifdef __KERNEL_SHADING__ # define __SVM__ @@ -195,6 +213,9 @@ CCL_NAMESPACE_BEGIN #ifdef __NO_TRANSPARENT__ # undef 
__TRANSPARENT_SHADOWS__ #endif +#ifdef __NO_SHADOW_TRICKS__ +#undef __SHADOW_TRICKS__ +#endif /* Random Numbers */ @@ -299,6 +320,8 @@ enum PathRayFlag { PATH_RAY_MIS_SKIP = 4096, PATH_RAY_DIFFUSE_ANCESTOR = 8192, PATH_RAY_SINGLE_PASS_DONE = 16384, + PATH_RAY_SHADOW_CATCHER = 32768, + PATH_RAY_SHADOW_CATCHER_ONLY = 65536, }; /* Closure Label */ @@ -428,6 +451,20 @@ typedef ccl_addr_space struct PathRadiance { float4 shadow; float mist; #endif + +#ifdef __SHADOW_TRICKS__ + /* Total light reachable across the path, ignoring shadow blocked queries. */ + float3 path_total; + /* Total light reachable across the path with shadow blocked queries + * applied here. + * + * Dividing this figure by path_total will give estimate of shadow pass. + */ + float3 path_total_shaded; + + /* Color of the background on which shadow is alpha-overed. */ + float3 shadow_color; +#endif } PathRadiance; typedef struct BsdfEval { @@ -443,6 +480,9 @@ typedef struct BsdfEval { float3 subsurface; float3 scatter; #endif +#ifdef __SHADOW_TRICKS__ + float3 sum_no_mis; +#endif } BsdfEval; /* Shader Flag */ @@ -536,7 +576,7 @@ typedef struct Ray { /* Intersection */ -typedef ccl_addr_space struct Intersection { +typedef struct Intersection { float t, u, v; int prim; int object; @@ -788,108 +828,89 @@ enum ShaderDataObjectFlag { SD_OBJECT_INTERSECTS_VOLUME = (1 << 5), /* Has position for motion vertices. 
*/ SD_OBJECT_HAS_VERTEX_MOTION = (1 << 6), + /* object is used to catch shadows */ + SD_OBJECT_SHADOW_CATCHER = (1 << 7), SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK | SD_OBJECT_MOTION | SD_OBJECT_TRANSFORM_APPLIED | SD_OBJECT_NEGATIVE_SCALE_APPLIED | SD_OBJECT_HAS_VOLUME | - SD_OBJECT_INTERSECTS_VOLUME) + SD_OBJECT_INTERSECTS_VOLUME | + SD_OBJECT_SHADOW_CATCHER) }; -#ifdef __SPLIT_KERNEL__ -# define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0)) -# if !defined(__SPLIT_KERNEL_SOA__) - /* ShaderData is stored as an Array-of-Structures */ -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (s[SD_THREAD].soa_##t) -# define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index]) -# else - /* ShaderData is stored as an Structure-of-Arrays */ -# define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1)) -# define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t) -# define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0) -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) + SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t) -# define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index]) -# endif -#else -# define ccl_soa_member(type, name) type name -# define ccl_fetch(s, t) (s->t) -# define ccl_fetch_array(s, t, index) (&s->t[index]) -#endif - typedef ccl_addr_space struct ShaderData { /* position */ - ccl_soa_member(float3, P); + float3 P; /* smooth normal for shading */ - ccl_soa_member(float3, N); + float3 N; /* true geometric normal */ - ccl_soa_member(float3, Ng); + float3 Ng; /* view/incoming direction */ - ccl_soa_member(float3, I); + float3 I; /* shader id */ - ccl_soa_member(int, shader); + int shader; /* booleans describing shader, see ShaderDataFlag */ - ccl_soa_member(int, flag); + int flag; /* booleans describing object of the shader, see ShaderDataObjectFlag */ 
- ccl_soa_member(int, object_flag); + int object_flag; /* primitive id if there is one, ~0 otherwise */ - ccl_soa_member(int, prim); + int prim; /* combined type and curve segment for hair */ - ccl_soa_member(int, type); + int type; /* parametric coordinates * - barycentric weights for triangles */ - ccl_soa_member(float, u); - ccl_soa_member(float, v); + float u; + float v; /* object id if there is one, ~0 otherwise */ - ccl_soa_member(int, object); + int object; /* motion blur sample time */ - ccl_soa_member(float, time); + float time; /* length of the ray being shaded */ - ccl_soa_member(float, ray_length); + float ray_length; #ifdef __RAY_DIFFERENTIALS__ /* differential of P. these are orthogonal to Ng, not N */ - ccl_soa_member(differential3, dP); + differential3 dP; /* differential of I */ - ccl_soa_member(differential3, dI); + differential3 dI; /* differential of u, v */ - ccl_soa_member(differential, du); - ccl_soa_member(differential, dv); + differential du; + differential dv; #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is * not readily suitable as a tangent for shading on triangles. 
*/ - ccl_soa_member(float3, dPdu); - ccl_soa_member(float3, dPdv); + float3 dPdu; + float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid * re-interpolating them constantly for shading */ - ccl_soa_member(Transform, ob_tfm); - ccl_soa_member(Transform, ob_itfm); + Transform ob_tfm; + Transform ob_itfm; #endif /* Closure data, we store a fixed array of closures */ - ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]); - ccl_soa_member(int, num_closure); - ccl_soa_member(int, num_closure_extra); - ccl_soa_member(float, randb_closure); - ccl_soa_member(float3, svm_closure_weight); + struct ShaderClosure closure[MAX_CLOSURE]; + int num_closure; + int num_closure_extra; + float randb_closure; + float3 svm_closure_weight; /* LCG state for closures that require additional random numbers. */ - ccl_soa_member(uint, lcg_state); + uint lcg_state; /* ray start position, only set for backgrounds */ - ccl_soa_member(float3, ray_P); - ccl_soa_member(differential3, ray_dP); + float3 ray_P; + differential3 ray_dP; #ifdef __OSL__ struct KernelGlobals *osl_globals; @@ -935,12 +956,16 @@ typedef struct PathState { RNG rng_congruential; VolumeStack volume_stack[VOLUME_STACK_SIZE]; #endif + +#ifdef __SHADOW_TRICKS__ + int catcher_object; +#endif } PathState; /* Subsurface */ /* Struct to gather multiple SSS hits. */ -struct SubsurfaceIntersection +typedef struct SubsurfaceIntersection { Ray ray; float3 weight[BSSRDF_MAX_HITS]; @@ -948,10 +973,10 @@ struct SubsurfaceIntersection int num_hits; struct Intersection hits[BSSRDF_MAX_HITS]; float3 Ng[BSSRDF_MAX_HITS]; -}; +} SubsurfaceIntersection; /* Struct to gather SSS indirect rays and delay tracing them. 
*/ -struct SubsurfaceIndirectRays +typedef struct SubsurfaceIndirectRays { bool need_update_volume_stack; bool tracing; @@ -962,7 +987,7 @@ struct SubsurfaceIndirectRays struct Ray rays[BSSRDF_MAX_HITS]; float3 throughputs[BSSRDF_MAX_HITS]; struct PathRadiance L[BSSRDF_MAX_HITS]; -}; +} SubsurfaceIndirectRays; /* Constant Kernel Data * @@ -1201,7 +1226,8 @@ typedef struct KernelBVH { int have_curves; int have_instancing; int use_qbvh; - int pad1, pad2; + int use_bvh_steps; + int pad1; } KernelBVH; static_assert_align(KernelBVH, 16); @@ -1296,20 +1322,19 @@ enum QueueNumber { #define RAY_STATE_MASK 0x007 #define RAY_FLAG_MASK 0x0F8 enum RayState { + RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ - RAY_ACTIVE = 0, + RAY_ACTIVE, /* Denotes ray has completed processing all samples and is inactive. */ - RAY_INACTIVE = 1, + RAY_INACTIVE, /* Denoted ray has exited path-iteration and needs to update output buffer. */ - RAY_UPDATE_BUFFER = 2, + RAY_UPDATE_BUFFER, /* Donotes ray has hit background */ - RAY_HIT_BACKGROUND = 3, + RAY_HIT_BACKGROUND, /* Denotes ray has to be regenerated */ - RAY_TO_REGENERATE = 4, + RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ - RAY_REGENERATED = 5, - /* Denotes ray should skip direct lighting */ - RAY_SKIP_DL = 6, + RAY_REGENERATED, /* Flag's ray has to execute shadow blocked function in AO part */ RAY_SHADOW_RAY_CAST_AO = 16, /* Flag's ray has to execute shadow blocked function in direct lighting part. 
*/ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index c7cb29b5af2..9c0878249d4 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -38,7 +38,7 @@ typedef struct VolumeShaderCoefficients { /* evaluate shader to get extinction coefficient at P */ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, float3 P, float3 *extinction) { @@ -64,7 +64,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, /* evaluate shader to get absorption, scattering and emission at P */ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, float3 P, VolumeShaderCoefficients *coeff) { @@ -112,7 +112,7 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel) return (channel == 0)? value.x: ((channel == 1)? value.y: value.z); } -ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack) +ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack) { for(int i = 0; stack[i].shader != SHADER_NONE; i++) { int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE); @@ -161,7 +161,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac /* homogeneous volume: assume shader evaluation at the starts gives * the extinction coefficient for the entire line segment */ -ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) +ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + float3 *throughput) { float3 sigma_t; @@ -171,7 +175,11 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s /* 
heterogeneous volume: integrate stepping through the volume until we * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) +ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + float3 *throughput) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ @@ -179,7 +187,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; float step = kernel_data.integrator.volume_step_size; - float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step; + float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step; /* compute extinction at the start */ float t = 0.0f; @@ -193,7 +201,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* use random position inside this segment to sample shader */ if(new_t == ray->t) - random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt; + random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt; float3 new_P = ray->P + ray->D * (t + random_jitter_offset); float3 sigma_t; @@ -227,7 +235,11 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* get the volume attenuation over line segment defined by ray, with the * assumption that there are no surfaces blocking light between the endpoints */ -ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput) +ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray, + float3 *throughput) { shader_setup_from_volume(kg, 
shadow_sd, ray); @@ -341,9 +353,15 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe /* homogeneous volume: assume shader evaluation at the start gives * the volume shading coefficient for the entire line segment */ -ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg, - PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, - RNG *rng, bool probalistic_scatter) +ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + PathRadiance *L, + ccl_addr_space float3 *throughput, + RNG *rng, + bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -444,8 +462,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba * volume until we reach the end, get absorbed entirely, or run out of * iterations. this does probabilistically scatter or get transmitted through * for path tracing where we don't want to branch. 
*/ -ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, - PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng) +ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + PathRadiance *L, + ccl_addr_space float3 *throughput, + RNG *rng) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ @@ -453,7 +477,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; float step_size = kernel_data.integrator.volume_step_size; - float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size; + float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step_size; /* compute coefficients at the start */ float t = 0.0f; @@ -474,7 +498,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* use random position inside this segment to sample shader */ if(new_t == ray->t) - random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt; + random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt; float3 new_P = ray->P + ray->D * (t + random_jitter_offset); VolumeShaderCoefficients coeff; @@ -579,8 +603,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( * ray, with the assumption that there are no surfaces blocking light * between the endpoints. distance sampling is used to decide if we will * scatter or not. 
*/ -ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, - PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous) +ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate( + KernelGlobals *kg, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray, + PathRadiance *L, + ccl_addr_space float3 *throughput, + RNG *rng, + bool heterogeneous) { shader_setup_from_volume(kg, sd, ray); @@ -590,6 +621,7 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true); } +#ifndef __SPLIT_KERNEL__ /* Decoupled Volume Sampling * * VolumeSegment is list of coefficients and transmittance stored at all steps @@ -966,7 +998,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); } } - if(sample_t < 1e-6f) { + if(sample_t < 1e-6f || pdf == 0.0f) { return VOLUME_PATH_SCATTERED; } @@ -990,6 +1022,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( return VOLUME_PATH_SCATTERED; } +#endif /* __SPLIT_KERNEL */ /* decide if we need to use decoupled or not */ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method) @@ -1021,9 +1054,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou ccl_device void kernel_volume_stack_init(KernelGlobals *kg, ShaderData *stack_sd, - const PathState *state, - const Ray *ray, - VolumeStack *stack) + ccl_addr_space const PathState *state, + ccl_addr_space const Ray *ray, + ccl_addr_space VolumeStack *stack) { /* NULL ray happens in the baker, does it need proper initialization of * camera in volume? 
@@ -1166,7 +1199,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg, } } -ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack) +ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack) { /* todo: we should have some way for objects to indicate if they want the * world shader to work inside them. excluding it by default is problematic @@ -1215,7 +1248,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, ShaderData *stack_sd, Ray *ray, - VolumeStack *stack) + ccl_addr_space VolumeStack *stack) { kernel_assert(kernel_data.integrator.use_volumes); @@ -1277,7 +1310,7 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, * the world's one after the last bounce to avoid render artifacts. */ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, - VolumeStack *volume_stack) + ccl_addr_space VolumeStack *volume_stack) { if(kernel_data.background.volume_shader != SHADER_NONE) { /* Keep the world's volume in stack. 
*/ diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 7d559b1aa31..28fc5ce1c30 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -17,177 +17,102 @@ #ifndef __KERNEL_WORK_STEALING_H__ #define __KERNEL_WORK_STEALING_H__ +CCL_NAMESPACE_BEGIN + /* * Utility functions for work stealing */ -#ifdef __WORK_STEALING__ - #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -uint get_group_id_with_ray_index(uint ray_index, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - int dim) +ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg) +{ + return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples; +} + +ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg) +{ + return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE; +} + +ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index) +{ + return ray_index / WORK_POOL_SIZE; +} + +ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool) { - if(dim == 0) { - uint x_span = ray_index % (tile_dim_x * parallel_samples); - return x_span / get_local_size(0); + uint total_work_size = kernel_total_work_size(kg); + uint num_pools = kernel_num_work_pools(kg); + + if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) { + return 0; + } + + uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE; + + uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE)); + if(work_pool < remainder / WORK_POOL_SIZE) { + work_size += WORK_POOL_SIZE; } - else /*if(dim == 1)*/ { - kernel_assert(dim == 1); - uint y_span = ray_index / (tile_dim_x * parallel_samples); - return y_span / get_local_size(1); + else if(work_pool == remainder / WORK_POOL_SIZE) { + work_size += remainder % WORK_POOL_SIZE; } + + return work_size; } -uint 
get_total_work(uint tile_dim_x, - uint tile_dim_y, - uint grp_idx, - uint grp_idy, - uint num_samples) +ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index) { - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - return threads_within_tile_border_x * - threads_within_tile_border_y * - num_samples; + uint num_pools = kernel_num_work_pools(kg); + uint pool = work_pool_from_ray_index(kg, ray_index); + + return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE) + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); } -/* Returns 0 in case there is no next work available */ -/* Returns 1 in case work assigned is valid */ -int get_next_work(ccl_global uint *work_pool, - ccl_private uint *my_work, - uint tile_dim_x, - uint tile_dim_y, - uint num_samples, - uint parallel_samples, - uint ray_index) +/* Returns true if there is work */ +ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint total_work = get_total_work(tile_dim_x, - tile_dim_y, - grp_idx, - grp_idy, - num_samples); - uint group_index = grp_idy * get_num_groups(0) + grp_idx; - *my_work = atomic_inc(&work_pool[group_index]); - return (*my_work < total_work) ? 
1 : 0; + uint work_pool = work_pool_from_ray_index(kg, ray_index); + uint pool_size = work_pool_work_size(kg, work_pool); + + if(pool_size == 0) { + return false; + } + + *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]); + return (*work_index < pool_size); } -/* This function assumes that the passed my_work is valid. */ -/* Decode sample number w.r.t. assigned my_work. */ -uint get_my_sample(uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - uint ray_index) +/* This function assumes that the passed `work` is valid. */ +/* Decode sample number w.r.t. assigned `work`. */ +ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - return my_work / - (threads_within_tile_border_x * threads_within_tile_border_y); + return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h); } -/* Decode pixel and tile position w.r.t. assigned my_work. */ -void get_pixel_tile_position(ccl_private uint *pixel_x, +/* Decode pixel and tile position w.r.t. assigned `work`. 
*/ +ccl_device void get_work_pixel_tile_position(KernelGlobals *kg, + ccl_private uint *pixel_x, ccl_private uint *pixel_y, ccl_private uint *tile_x, ccl_private uint *tile_y, - uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint tile_offset_x, - uint tile_offset_y, - uint parallel_samples, + uint work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? 
get_local_size(1) - : threads_within_tile_border_y; - - uint total_associated_pixels = - threads_within_tile_border_x * threads_within_tile_border_y; - uint work_group_pixel_index = my_work % total_associated_pixels; - uint work_group_pixel_x = - work_group_pixel_index % threads_within_tile_border_x; - uint work_group_pixel_y = - work_group_pixel_index / threads_within_tile_border_x; - - *pixel_x = - tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; - *pixel_y = - tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; - *tile_x = *pixel_x - tile_offset_x; - *tile_y = *pixel_y - tile_offset_y; + uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h); + + *tile_x = pixel_index % kernel_split_params.w; + *tile_y = pixel_index / kernel_split_params.w; + + *pixel_x = *tile_x + kernel_split_params.x; + *pixel_y = *tile_y + kernel_split_params.y; } -#endif /* __WORK_STEALING__ */ +CCL_NAMESPACE_END #endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 72dbbd9a416..16992c681e6 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -56,9 +56,9 @@ /* do nothing */ #endif -#include "kernel.h" +#include "kernel/kernel.h" #define KERNEL_ARCH cpu -#include "kernel_cpu_impl.h" +#include "kernel/kernels/cpu/kernel_cpu_impl.h" CCL_NAMESPACE_BEGIN @@ -90,7 +90,7 @@ void kernel_tex_copy(KernelGlobals *kg, kg->tname.width = width; \ } #define KERNEL_IMAGE_TEX(type, ttype, tname) -#include "kernel_textures.h" +#include "kernel/kernel_textures.h" else if(strstr(name, "__tex_image_float4")) { texture_image_float4 *tex = NULL; diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 1350d9e5c2e..2600d977972 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ 
b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -28,10 +28,10 @@ # define __KERNEL_AVX__ #endif -#include "util_optimization.h" +#include "util/util_optimization.h" #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel.h" +# include "kernel/kernel.h" # define KERNEL_ARCH cpu_avx -# include "kernel_cpu_impl.h" +# include "kernel/kernels/cpu/kernel_cpu_impl.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index 1a416e771ee..dba15d037ac 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -29,10 +29,10 @@ # define __KERNEL_AVX2__ #endif -#include "util_optimization.h" +#include "util/util_optimization.h" #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel.h" +# include "kernel/kernel.h" # define KERNEL_ARCH cpu_avx2 -# include "kernel_cpu_impl.h" +# include "kernel/kernels/cpu/kernel_cpu_impl.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 1a07c705f1c..896b80d783e 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -49,4 +49,44 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample); +/* Split kernels */ + +void KERNEL_FUNCTION_FULL_NAME(data_init)( + KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + +#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ + void 
KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + +DECLARE_SPLIT_KERNEL_FUNCTION(path_init) +DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) +DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) +DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); + #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index ec82d4b4c22..148b2eef568 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -20,18 +20,45 @@ * simply includes this file without worry of copying actual implementation over. 
*/ -#include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_cpu_image.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_path_branched.h" -#include "kernel_bake.h" +#include "kernel/kernel_compat_cpu.h" + +#ifndef __SPLIT_KERNEL__ +# include "kernel/kernel_math.h" +# include "kernel/kernel_types.h" + +# include "kernel/split/kernel_split_data.h" +# include "kernel/kernel_globals.h" + +# include "kernel/kernels/cpu/kernel_cpu_image.h" +# include "kernel/kernel_film.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" +# include "kernel/kernel_bake.h" +#else +# include "kernel/split/kernel_split_common.h" + +# include "kernel/split/kernel_data_init.h" +# include "kernel/split/kernel_path_init.h" +# include "kernel/split/kernel_scene_intersect.h" +# include "kernel/split/kernel_lamp_emission.h" +# include "kernel/split/kernel_do_volume.h" +# include "kernel/split/kernel_queue_enqueue.h" +# include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_eval.h" +# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "kernel/split/kernel_subsurface_scatter.h" +# include "kernel/split/kernel_direct_lighting.h" +# include "kernel/split/kernel_shadow_blocked_ao.h" +# include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_next_iteration_setup.h" +# include "kernel/split/kernel_indirect_subsurface.h" +# include "kernel/split/kernel_buffer_update.h" +#endif CCL_NAMESPACE_BEGIN +#ifndef __SPLIT_KERNEL__ + /* Path Tracing */ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, @@ -131,4 +158,72 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, } } +#else /* __SPLIT_KERNEL__ */ + +/* Split Kernel Path Tracing */ + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + 
kernel_##name(kg); \ + } + +#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + ccl_local type locals; \ + kernel_##name(kg, &locals); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) +{ +#define REGISTER_NAME_STRING(name) #name +#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) +#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); + + REGISTER(path_trace); + REGISTER(convert_to_byte); + REGISTER(convert_to_half_float); + REGISTER(shader); + + REGISTER(data_init); + REGISTER(path_init); + REGISTER(scene_intersect); + REGISTER(lamp_emission); + REGISTER(do_volume); + REGISTER(queue_enqueue); + REGISTER(indirect_background); + REGISTER(shader_eval); + REGISTER(holdout_emission_blurring_pathtermination_ao); + REGISTER(subsurface_scatter); + REGISTER(direct_lighting); + REGISTER(shadow_blocked_ao); + REGISTER(shadow_blocked_dl); + REGISTER(next_iteration_setup); + REGISTER(indirect_subsurface); + 
REGISTER(buffer_update); + +#undef REGISTER +#undef REGISTER_EVAL_NAME +#undef REGISTER_NAME_STRING +} + +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp new file mode 100644 index 00000000000..ca750e5a00d --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. 
+ */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/kernel_cpu_impl.h" + diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp new file mode 100644 index 00000000000..27a746a0799 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# include "kernel/kernel.h" +# define KERNEL_ARCH cpu_avx +# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp new file mode 100644 index 00000000000..364d279a189 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# include "kernel/kernel.h" +# define KERNEL_ARCH cpu_avx2 +# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp new file mode 100644 index 00000000000..0afb481296f --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# include "kernel/kernel.h" +# define KERNEL_ARCH cpu_sse2 +# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp new file mode 100644 index 00000000000..13d00813591 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# include "kernel/kernel.h" +# define KERNEL_ARCH cpu_sse3 +# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp new file mode 100644 index 00000000000..a4312071edc --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE4.1 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# include "kernel/kernel.h" +# define KERNEL_ARCH cpu_sse41 +# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index a5f2d6e7294..1acfaa91ac9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -23,10 +23,10 @@ # define __KERNEL_SSE2__ #endif -#include "util_optimization.h" +#include "util/util_optimization.h" #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel.h" +# include "kernel/kernel.h" # define KERNEL_ARCH cpu_sse2 -# include "kernel_cpu_impl.h" +# include "kernel/kernels/cpu/kernel_cpu_impl.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index 86f9ce991f8..f7b6a2e21fe 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -25,10 +25,10 @@ # define __KERNEL_SSSE3__ #endif -#include "util_optimization.h" +#include "util/util_optimization.h" #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel.h" +# include "kernel/kernel.h" # define KERNEL_ARCH cpu_sse3 -# include "kernel_cpu_impl.h" +# include "kernel/kernels/cpu/kernel_cpu_impl.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index c174406047d..1900c6e3012 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp 
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -26,10 +26,10 @@ # define __KERNEL_SSE41__ #endif -#include "util_optimization.h" +#include "util/util_optimization.h" #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel.h" +# include "kernel/kernel.h" # define KERNEL_ARCH cpu_sse41 -# include "kernel_cpu_impl.h" +# include "kernel/kernels/cpu//kernel_cpu_impl.h" #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index eb2b6ea5414..dc343cb387a 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,113 +16,19 @@ /* CUDA kernel entry points */ -#include "../../kernel_compat_cuda.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_film.h" -#include "../../kernel_path.h" -#include "../../kernel_path_branched.h" -#include "../../kernel_bake.h" - -/* device data taken from CUDA occupancy calculator */ - #ifdef __CUDA_ARCH__ -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 
-# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" 
-#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif +#include "kernel/kernel_compat_cuda.h" +#include "kernel_config.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_path.h" +#include "kernel/kernel_path_branched.h" +#include "kernel/kernel_bake.h" /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -130,8 +36,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int s int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) - kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if(x < sx + sw && y < sy + sh) { + KernelGlobals kg; + kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride); + } } #ifdef __BRANCHED_PATH__ @@ -142,8 +50,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) - kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if(x < sx + sw && y < sy + sh) { + KernelGlobals kg; + kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride); + } } #endif @@ -154,8 +64,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) + if(x < sx + sw && y < sy + sh) { kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); + } } 
extern "C" __global__ void @@ -165,8 +76,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) + if(x < sx + sw && y < sy + sh) { kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); + } } extern "C" __global__ void @@ -183,7 +95,8 @@ kernel_cuda_shader(uint4 *input, int x = sx + blockDim.x*blockIdx.x + threadIdx.x; if(x < sx + sw) { - kernel_shader_evaluate(NULL, + KernelGlobals kg; + kernel_shader_evaluate(&kg, input, output, output_luma, @@ -200,8 +113,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - if(x < sx + sw) - kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample); + if(x < sx + sw) { + KernelGlobals kg; + kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample); + } } #endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..9fa39dc9ebb --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* device data taken from CUDA occupancy calculator */ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 32 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 48 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* unknown architecture */ +#else 
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..a679eff8409 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "kernel/kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_data_init.h" +#include "kernel/split/kernel_path_init.h" +#include "kernel/split/kernel_scene_intersect.h" +#include "kernel/split/kernel_lamp_emission.h" +#include "kernel/split/kernel_do_volume.h" +#include "kernel/split/kernel_queue_enqueue.h" +#include "kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_eval.h" +#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "kernel/split/kernel_subsurface_scatter.h" +#include "kernel/split/kernel_direct_lighting.h" +#include "kernel/split/kernel_shadow_blocked_ao.h" +#include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_next_iteration_setup.h" +#include "kernel/split/kernel_indirect_subsurface.h" +#include "kernel/split/kernel_buffer_update.h" + +#include "kernel/kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size) +{ + *size = split_data_buffer_size(NULL, num_threads); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + split_data_buffer, + num_elements, + ray_state, + rng_state, + start_sample, + end_sample, + sx, sy, sw, sh, 
offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + ccl_local type locals; \ + kernel_##name(NULL, &locals); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, 
CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index a68f97857b6..078acc1631e 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -16,34 +16,34 @@ /* OpenCL kernel entry points - unfinished */ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_image_opencl.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_image_opencl.h" -#include "../../kernel_film.h" +#include "kernel/kernel_film.h" #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) -# include "../../kernel_path.h" -# include "../../kernel_path_branched.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" #else /* __COMPILE_ONLY_MEGAKERNEL__ */ /* Include only actually used headers for the case * when path tracing kernels are not needed. 
*/ -# include "../../kernel_random.h" -# include "../../kernel_differential.h" -# include "../../kernel_montecarlo.h" -# include "../../kernel_projection.h" -# include "../../geom/geom.h" -# include "../../bvh/bvh.h" - -# include "../../kernel_accumulate.h" -# include "../../kernel_camera.h" -# include "../../kernel_shader.h" +# include "kernel/kernel_random.h" +# include "kernel/kernel_differential.h" +# include "kernel/kernel_montecarlo.h" +# include "kernel/kernel_projection.h" +# include "kernel/geom/geom.h" +# include "kernel/bvh/bvh.h" + +# include "kernel/kernel_accumulate.h" +# include "kernel/kernel_camera.h" +# include "kernel/kernel_shader.h" #endif /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */ -#include "../../kernel_bake.h" +#include "kernel/kernel_bake.h" #ifdef __COMPILE_ONLY_MEGAKERNEL__ @@ -54,7 +54,7 @@ __kernel void kernel_ocl_path_trace( #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -65,10 +65,10 @@ __kernel void kernel_ocl_path_trace( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); @@ -84,7 +84,7 @@ __kernel void kernel_ocl_shader( #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" int type, int sx, int sw, int offset, int sample) { @@ -94,9 +94,9 @@ __kernel void kernel_ocl_shader( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - int x = sx + get_global_id(0); + int x = sx + 
ccl_global_id(0); if(x < sx + sw) { kernel_shader_evaluate(kg, @@ -116,7 +116,7 @@ __kernel void kernel_ocl_bake( #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" int type, int filter, int sx, int sw, int offset, int sample) { @@ -126,9 +126,9 @@ __kernel void kernel_ocl_bake( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { #ifdef __NO_BAKING__ @@ -146,7 +146,7 @@ __kernel void kernel_ocl_convert_to_byte( #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -157,10 +157,10 @@ __kernel void kernel_ocl_convert_to_byte( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); @@ -173,7 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float( #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -184,13 +184,29 @@ __kernel void kernel_ocl_convert_to_half_float( #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_half_float(kg, 
rgba, buffer, sample_scale, x, y, offset, stride); } +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +{ + size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + if(i < size / sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} + #endif /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl deleted file mode 100644 index 1914d241eb1..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "split/kernel_background_buffer_update.h" - -__kernel void kernel_ocl_path_trace_background_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - ccl_global int *Queue_data, /* Queues memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(ray_index == 0) { - /* We will empty this queue in this kernel. 
*/ - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - Queue_data, - queuesize, - 1); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = - kernel_background_buffer_update((KernelGlobals *)kg, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - sw, sh, sx, sy, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - work_array, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. 
- */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl new file mode 100644 index 00000000000..db65c91baf7 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_buffer_update.h" + +__kernel void kernel_ocl_path_trace_buffer_update( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + ccl_local unsigned int local_queue_atomics; + kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 18139687eab..8b85d362f8a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -14,77 +14,49 @@ * limitations under the License. 
*/ -#include "split/kernel_data_init.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_data_init.h" __kernel void kernel_ocl_path_trace_data_init( - ccl_global char *globals, - ccl_global char *sd_DL_shadow, + ccl_global char *kg, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef 
__WORK_STEALING__ ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global float *buffer) { - kernel_data_init((KernelGlobals *)globals, - (ShaderData *)sd_DL_shadow, + kernel_data_init((KernelGlobals*)kg, data, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, + split_data_buffer, + num_elements, ray_state, + rng_state, #define KERNEL_TEX(type, ttype, name) name, -#include "../../kernel_textures.h" +#include "kernel/kernel_textures.h" - start_sample, sx, sy, sw, sh, offset, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - Queue_data, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, Queue_index, queuesize, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples); + buffer); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index c6a2c8d050c..eb34f750881 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -14,74 +14,14 @@ * limitations under the License. 
*/ -#include "split/kernel_direct_lighting.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_direct_lighting.h" __kernel void kernel_ocl_path_trace_direct_lighting( ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + ccl_constant KernelData *data) { ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. 
- */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - ray_index); - -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -#endif + kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl new file mode 100644 index 00000000000..83ef5f5f3f2 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_do_volume.h" + +__kernel void kernel_ocl_path_trace_do_volume( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + kernel_do_volume((KernelGlobals*)kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index e063614da1a..d071b39aa6f 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -14,110 +14,16 @@ * limitations under the License. */ -#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* 
Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics_bg; - ccl_local unsigned int local_queue_atomics_ao; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics_bg = 0; - local_queue_atomics_ao = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. 
- */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif /* __COMPUTE_DEVICE_GPU__ */ - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals *)kg, - (ShaderData *)sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - sw, sh, sx, sy, stride, - ray_state, - work_array, -#ifdef __WORK_STEALING__ - start_sample, -#endif - parallel_samples, - ray_index, - &enqueue_flag, - &enqueue_flag_AO_SHADOW_RAY_CAST); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics_bg, - Queue_data, - Queue_index); - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, - queuesize, - &local_queue_atomics_ao, - Queue_data, - Queue_index); -#endif + ccl_local BackgroundAOLocals locals; + kernel_holdout_emission_blurring_pathtermination_ao( + (KernelGlobals*)kg, + &locals); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl new file mode 100644 index 00000000000..8c213ff5cb2 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_indirect_background.h" + +__kernel void kernel_ocl_path_trace_indirect_background( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + kernel_indirect_background((KernelGlobals*)kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl new file mode 100644 index 00000000000..998ebc4c0c3 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_indirect_subsurface.h" + +__kernel void kernel_ocl_path_trace_indirect_subsurface( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + kernel_indirect_subsurface((KernelGlobals*)kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 267bddc2ffc..822d2287715 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -14,67 +14,13 @@ * limitations under the License. */ -#include "split/kernel_lamp_emission.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_lamp_emission.h" __kernel void kernel_ocl_path_trace_lamp_emission( ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_constant KernelData *data) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* We will empty this queue in this kernel. 
*/ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } - /* Fetch use_queues_flag. */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_lamp_emission((KernelGlobals *)kg, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, - ray_index); + kernel_lamp_emission((KernelGlobals*)kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d49b6294a8..6d207253a40 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -14,101 +14,14 @@ * limitations under the License. 
*/ -#include "split/kernel_next_iteration_setup.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_next_iteration_setup.h" __kernel void kernel_ocl_path_trace_next_iteration_setup( ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ + ccl_constant KernelData *data) { ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * 
has already been executed atleast once. From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - use_queues_flag[0] = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_dl_coop, - ISLamp_coop, - BSDFEval_coop, - LightRay_ao_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - use_queues_flag, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); + kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl new file mode 100644 index 00000000000..bd9aa9538c8 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_path_init.h" + +__kernel void kernel_ocl_path_trace_path_init( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + kernel_path_init((KernelGlobals*)kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 3156dc255fb..9be154e3d75 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -14,93 +14,14 @@ * limitations under the License. 
*/ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_queues.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_queue_enqueue.h" -/* - * The kernel "kernel_queue_enqueue" enqueues rays of - * different ray state into their appropriate Queues; - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel - * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * The input and output of the kernel is as follows, - * - * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | - * queuesize -------------------------------------------| | - * - * Note on Queues : - * State of queues during the first time this kernel is called : - * At entry, - * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. - * - * State of queue during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. 
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ __kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int queuesize) /* Size (capacity) of each queue */ + ccl_global char *kg, + ccl_constant KernelData *data) { - /* We have only 2 cases (Hit/Not-Hit) */ - ccl_local unsigned int local_queue_atomics[2]; - - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - - if(lidx < 2 ) { - local_queue_atomics[lidx] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if(queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidx == 0) { - local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = - get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, - local_queue_atomics, - Queue_index); - local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = - get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - local_queue_atomics, - Queue_index); - } - barrier(CLK_LOCAL_MEM_FENCE); - - unsigned int my_gqidx; - if(queue_number != -1) { - my_gqidx = get_global_queue_index(queue_number, - queuesize, - my_lqidx, - local_queue_atomics); - Queue_data[my_gqidx] = ray_index; - } + ccl_local QueueEnqueueLocals locals; + kernel_queue_enqueue((KernelGlobals*)kg, &locals); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index 7f3f433c7a6..eb4fb4d153a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -14,67 +14,13 @@ * limitations under the License. */ -#include "split/kernel_scene_intersect.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_scene_intersect.h" __kernel void kernel_ocl_path_trace_scene_intersect( ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_constant KernelData *data) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x 
+ y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_scene_intersect((KernelGlobals *)kg, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - ray_index); + kernel_scene_intersect((KernelGlobals*)kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index c37856c8f30..6baee460986 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -14,55 +14,14 @@ * limitations under the License. */ -#include "split/kernel_shader_eval.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_eval.h" __kernel void kernel_ocl_path_trace_shader_eval( ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + ccl_constant KernelData *data) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); - - /* Continue on with shader evaluation. */ - kernel_shader_eval((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - ray_index); + kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl deleted file mode 100644 index edf76fba714..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "split/kernel_shadow_blocked.h" - -__kernel void kernel_ocl_path_trace_shadow_blocked( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); - - ccl_local unsigned int ao_queue_length; - ccl_local unsigned int dl_queue_length; - if(lidx == 0) { - ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* flag determining if the current ray is to process shadow ray for AO or DL */ - char shadow_blocked_type = -1; - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(thread_index < ao_queue_length + dl_queue_length) { - if(thread_index < ao_queue_length) { - ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; - } else { - ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; - } - } - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - kernel_shadow_blocked((KernelGlobals *)kg, - PathState_coop, - LightRay_dl_coop, - LightRay_ao_coop, - ray_state, - shadow_blocked_type, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl new file mode 100644 index 
00000000000..6a8ef81b32a --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shadow_blocked_ao.h" + +__kernel void kernel_ocl_path_trace_shadow_blocked_ao( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + kernel_shadow_blocked_ao((KernelGlobals*)kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl new file mode 100644 index 00000000000..b255cc5ef8b --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shadow_blocked_dl.h" + +__kernel void kernel_ocl_path_trace_shadow_blocked_dl( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + kernel_shadow_blocked_dl((KernelGlobals*)kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl new file mode 100644 index 00000000000..732cda30115 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernels/opencl/kernel_state_buffer_size.cl" +#include "kernel/kernels/opencl/kernel_data_init.cl" +#include "kernel/kernels/opencl/kernel_path_init.cl" + +#include "kernel/kernels/opencl/kernel_scene_intersect.cl" +#include "kernel/kernels/opencl/kernel_lamp_emission.cl" +#include "kernel/kernels/opencl/kernel_do_volume.cl" +#include "kernel/kernels/opencl/kernel_indirect_background.cl" +#include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_eval.cl" +#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" +#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" +#include "kernel/kernels/opencl/kernel_direct_lighting.cl" +#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" +#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" +#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" +#include "kernel/kernels/opencl/kernel_buffer_update.cl" + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl new file mode 100644 index 00000000000..c10ecc426c6 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" + +__kernel void kernel_ocl_path_trace_state_buffer_size( + ccl_global char *kg, + ccl_constant KernelData *data, + uint num_threads, + ccl_global uint64_t *size) +{ + ((KernelGlobals*)kg)->data = data; + *size = split_data_buffer_size((KernelGlobals*)kg, num_threads); +} + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl new file mode 100644 index 00000000000..7a1838e485f --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_subsurface_scatter.h" + +__kernel void kernel_ocl_path_trace_subsurface_scatter( + ccl_global char *kg, + ccl_constant KernelData *data) +{ + ccl_local unsigned int local_queue_atomics; + kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl deleted file mode 100644 index 88a1ed830af..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "split/kernel_sum_all_radiance.h" - -__kernel void kernel_ocl_path_trace_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - kernel_sum_all_radiance(data, - buffer, - per_sample_output_buffer, - parallel_samples, - sw, sh, stride, - buffer_offset_x, - buffer_offset_y, - buffer_stride, - start_sample); -} diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt index 98de40e5a8a..d2eb89e0e0a 100644 --- a/intern/cycles/kernel/osl/CMakeLists.txt +++ b/intern/cycles/kernel/osl/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - .. - ../svm - ../../graph - ../../render - ../../util - ../../device + ../.. ) set(INC_SYS diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp index d835f9be45c..2e73e7a601e 100644 --- a/intern/cycles/kernel/osl/background.cpp +++ b/intern/cycles/kernel/osl/background.cpp @@ -34,10 +34,10 @@ #include <OSL/genclosure.h> -#include "osl_closures.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_compat_cpu.h" -#include "closure/alloc.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/closure/alloc.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index bc26f42b559..ea18f2c8c86 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -34,13 +34,13 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "kernel_montecarlo.h" 
-#include "closure/alloc.h" -#include "closure/bsdf_diffuse_ramp.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_diffuse_ramp.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index 14c7644936e..a26671eb09e 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -34,12 +34,12 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "closure/alloc.h" -#include "closure/bsdf_phong_ramp.h" +#include "kernel/kernel_types.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_phong_ramp.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp index 3f13e08b302..8843a196dad 100644 --- a/intern/cycles/kernel/osl/emissive.cpp +++ b/intern/cycles/kernel/osl/emissive.cpp @@ -34,12 +34,12 @@ #include <OSL/genclosure.h> -#include "osl_closures.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_compat_cpu.h" -#include "kernel_types.h" -#include "closure/alloc.h" -#include "closure/emissive.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_types.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/emissive.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 44daefee249..188c3960a5f 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -32,17 +32,17 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "kernel_montecarlo.h" +#include 
"kernel/kernel_types.h" +#include "kernel/kernel_montecarlo.h" -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include "closure/bsdf_diffuse.h" -#include "closure/bsdf_principled_diffuse.h" -#include "closure/bssrdf.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bssrdf.h" CCL_NAMESPACE_BEGIN @@ -81,7 +81,7 @@ public: bssrdf->sharpness = sharpness; bssrdf->N = params.N; bssrdf->roughness = params.roughness; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -93,7 +93,7 @@ public: bssrdf->sharpness = sharpness; bssrdf->N = params.N; bssrdf->roughness = params.roughness; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -105,7 +105,7 @@ public: bssrdf->sharpness = sharpness; bssrdf->N = params.N; bssrdf->roughness = params.roughness; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } } diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 5570a22692e..5b66793a05d 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -33,35 +33,36 @@ #include <OSL/genclosure.h> #include <OSL/oslclosure.h> -#include "osl_closures.h" -#include "osl_shader.h" - -#include "util_debug.h" -#include "util_math.h" -#include "util_param.h" - -#include "kernel_types.h" -#include "kernel_compat_cpu.h" -#include "kernel_globals.h" -#include "kernel_montecarlo.h" -#include "kernel_random.h" - -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include 
"closure/bsdf_ashikhmin_velvet.h" -#include "closure/bsdf_diffuse.h" -#include "closure/bsdf_microfacet.h" -#include "closure/bsdf_microfacet_multi.h" -#include "closure/bsdf_oren_nayar.h" -#include "closure/bsdf_reflection.h" -#include "closure/bsdf_refraction.h" -#include "closure/bsdf_transparent.h" -#include "closure/bsdf_ashikhmin_shirley.h" -#include "closure/bsdf_toon.h" -#include "closure/bsdf_hair.h" -#include "closure/bsdf_principled_diffuse.h" -#include "closure/bsdf_principled_sheen.h" -#include "closure/volume.h" +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_shader.h" + +#include "util/util_debug.h" +#include "util/util_math.h" +#include "util/util_param.h" + +#include "kernel/kernel_types.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_random.h" + +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf_ashikhmin_velvet.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_microfacet.h" +#include "kernel/closure/bsdf_microfacet_multi.h" +#include "kernel/closure/bsdf_oren_nayar.h" +#include "kernel/closure/bsdf_reflection.h" +#include "kernel/closure/bsdf_refraction.h" +#include "kernel/closure/bsdf_transparent.h" +#include "kernel/closure/bsdf_ashikhmin_shirley.h" +#include "kernel/closure/bsdf_toon.h" +#include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" +#include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index c9740f81c8a..ff5fd9cc905 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -33,8 +33,8 @@ #ifndef __OSL_CLOSURES_H__ #define __OSL_CLOSURES_H__ -#include "util_types.h" 
-#include "kernel_types.h" +#include "util/util_types.h" +#include "kernel/kernel_types.h" #include <OSL/oslclosure.h> #include <OSL/oslexec.h> diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 65cb7ecc6b4..02c083a83f8 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -21,10 +21,10 @@ #include <OSL/oslexec.h> -#include "util_map.h" -#include "util_param.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_thread.h" +#include "util/util_vector.h" #ifndef WIN32 using std::isfinite; diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 58bbdc33920..b767c60c617 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -25,33 +25,34 @@ #include <string.h> -#include "mesh.h" -#include "object.h" -#include "scene.h" - -#include "osl_closures.h" -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_string.h" - -#include "kernel_compat_cpu.h" -#include "kernel_globals.h" -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_differential.h" -#include "kernel_montecarlo.h" -#include "kernel_camera.h" -#include "kernels/cpu/kernel_cpu_image.h" -#include "geom/geom.h" -#include "bvh/bvh.h" - -#include "kernel_projection.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" + +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_string.h" + +#include "kernel/kernel_compat_cpu.h" +#include 
"kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernels/cpu/kernel_cpu_image.h" +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" + +#include "kernel/kernel_projection.h" +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" #ifdef WITH_PTEX # include <Ptexture.h> diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 0d762bbdb38..13b19d86eca 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -16,21 +16,22 @@ #include <OSL/oslexec.h> -#include "kernel_compat_cpu.h" -#include "kernel_montecarlo.h" -#include "kernel_types.h" -#include "kernel_globals.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" -#include "geom/geom_object.h" +#include "kernel/geom/geom_object.h" -#include "osl_closures.h" -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" -#include "util_foreach.h" +#include "util/util_foreach.h" -#include "attribute.h" +#include "render/attribute.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index ad06dd6929d..32121e940b4 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -29,7 +29,7 @@ * This means no thread state must be passed along in the kernel itself. 
*/ -#include "kernel_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h deleted file mode 100644 index 9bfa71c75ef..00000000000 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_split_common.h" - -/* Note on kernel_background_buffer_update kernel. - * This is the fourth kernel in the ray tracing logic, and the third - * of the path iteration kernels. This kernel takes care of rays that hit - * the background (sceneintersect kernel), and for the rays of - * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in - * the output buffer. This kernel also takes care of rays that have been determined - * to-be-regenerated. - * - * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel - * - * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER - * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state - * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. 
- * - * The input and output are as follows, - * - * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop - * throughput_coop --------------------------------------| |--- L_transparent_coop - * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers - * Ray_coop ---------------------------------------------| |--- ray_state - * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array - * parallel_samples -------------------------------------| |--- PathState_coop - * end_sample -------------------------------------------| |--- throughput_coop - * kg (globals) -----------------------------------------| |--- rng_coop - * rng_state --------------------------------------------| |--- Ray - * PathRadiance_coop ------------------------------------| | - * sw ---------------------------------------------------| | - * sh ---------------------------------------------------| | - * sx ---------------------------------------------------| | - * sy ---------------------------------------------------| | - * stride -----------------------------------------------| | - * work_array -------------------------------------------| |--- work_array - * queuesize --------------------------------------------| | - * start_sample -----------------------------------------| |--- work_pool_wgs - * work_pool_wgs ----------------------------------------| | - * num_samples ------------------------------------------| | - * - * note on sd : sd 
argument is neither an input nor an output for this kernel. It is just filled and consumed here itself. - * Note on Queues : - * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. - * - * State of queues when this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty - */ -ccl_device char kernel_background_buffer_update( - KernelGlobals *kg, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index) -{ - char enqueue_flag = 0; -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; -#endif - ccl_global PathState 
*state = &PathState_coop[ray_index]; - PathRadiance *L = L = &PathRadiance_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global float *L_transparent = &L_transparent_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - -#ifdef __WORK_STEALING__ - unsigned int my_work; - ccl_global float *initial_per_sample_output_buffers; - ccl_global uint *initial_rng; -#endif - unsigned int sample; - unsigned int tile_x; - unsigned int tile_y; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - my_sample_tile = 0; - initial_per_sample_output_buffers = per_sample_output_buffers; - initial_rng = rng_state; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - int tile_index = ray_index / parallel_samples; - /* buffer and rng_state's stride is "stride". 
Find x and y using ray_index */ - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { - *L_transparent = (*L_transparent) + average((*throughput)); -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, kg->sd_input, state, ray); - path_radiance_accum_background(L, (*throughput), L_background, state->bounce); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - } - - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); -#endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); - - /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - /* We have completed current work; So get next work */ - int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, 
parallel_samples, ray_index); - if(!valid_work) { - /* If work is invalid, this means no more work is available and the thread may exit */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#else /* __WORK_STEALING__ */ - if((sample + parallel_samples) >= end_sample) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#endif /* __WORK_STEALING__ */ - - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - work_array[ray_index] = my_work; - /* Get the sample associated with the current work */ - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - /* Get pixel and tile position associated with current work */ - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; - - /* Remap rng_state according to the current work */ - rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); - /* Remap per_sample_output_buffers according to the current work */ - per_sample_output_buffers = initial_per_sample_output_buffers - + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; -#else /* __WORK_STEALING__ */ - work_array[ray_index] = sample + parallel_samples; - sample = work_array[ray_index]; - - /* Get ray position from ray index */ - pixel_x = sx + ((ray_index / parallel_samples) % sw); - pixel_y = sy + ((ray_index / parallel_samples) / sw); -#endif /* __WORK_STEALING__ */ - - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); - - if(ray->t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. 
- */ - *throughput = make_float3(1.0f, 1.0f, 1.0f); - *L_transparent = 0.0f; - path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, kg->sd_input, state, rng, sample, ray); -#ifdef __KERNEL_DEBUG__ - debug_data_init(debug_data); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - enqueue_flag = 1; - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - } - } - return enqueue_flag; -} diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h new file mode 100644 index 00000000000..859c221d976 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -0,0 +1,206 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel takes care of rays that hit the background (sceneintersect + * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's + * accumulated radiance in the output buffer. This kernel also takes care of + * rays that have been determined to-be-regenerated. + * + * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel. 
+ * + * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER + * will be eventually set to RAY_TO_REGENERATE state in this kernel. + * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put + * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * State of queues when this kernel is called: + * At entry, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays. + * At exit, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and + * RAY_REGENERATED rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + */ +ccl_device void kernel_buffer_update(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. */ + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(kg, ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. 
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + ccl_global uint *rng_state = kernel_split_params.rng_state; + int stride = kernel_split_params.stride; + + ccl_global char *ray_state = kernel_split_state.ray_state; +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; +#endif + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer; + + unsigned int work_index; + ccl_global uint *initial_rng; + + unsigned int sample; + unsigned int tile_x; + unsigned int tile_y; + unsigned int pixel_x; + unsigned int pixel_y; + + work_index = kernel_split_state.work_array[ray_index]; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, + &tile_x, &tile_y, + work_index, + ray_index); + initial_rng = rng_state; + + rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, L); + } + kernel_write_light_passes(kg, buffer, L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, state, debug_data, sample); +#endif + float4 L_rad = 
make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); + + /* accumulate result in output buffer */ + kernel_write_pass_float4(buffer, sample, L_rad); + path_rng_end(kg, rng_state, rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + /* We have completed current work; So get next work */ + int valid_work = get_next_work(kg, &work_index, ray_index); + if(!valid_work) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + kernel_split_state.work_array[ray_index] = work_index; + /* Get the sample associated with the current work */ + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + /* Get pixel and tile position associated with current work */ + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); + + /* Remap rng_state according to the current work */ + rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride; + /* Remap buffer according to the current work */ + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + + /* Initialize random numbers and ray. */ + kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray); + + if(ray->t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; + * These rays proceed with path-iteration. 
+ */ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + *L_transparent = 0.0f; + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray); +#ifdef __SUBSURFACE__ + kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); +#endif +#ifdef __KERNEL_DEBUG__ + debug_data_init(debug_data); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } + else { + /* These rays do not participate in path-iteration. */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Accumulate result in output buffer. */ + kernel_write_pass_float4(buffer, sample, L_rad); + path_rng_end(kg, rng_state, rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + } + kernel_split_state.rng[ray_index] = rng; + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * These rays will be made active during next SceneIntersectkernel. + */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 6e158d53d23..9d3d01fff75 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -14,108 +14,105 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_data_initialization kernel - * This kernel Initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. +/* This kernel Initializes structures needed in path-iteration kernels. 
* - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - * - * Its input and output are as follows, - * - * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng - * Un-initialized throughput -------| |--- Initialized throughput - * Un-initialized L_transparent ----| |--- Initialized L_transparent - * Un-initialized PathRadiance -----| |--- Initialized PathRadiance - * Un-initialized Ray --------------| |--- Initialized Ray - * Un-initialized PathState --------| |--- Initialized PathState - * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) - * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0) - * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) - * Un-initialized ray_state --------| |--- Initialized ray_state - * parallel_samples --------------- | |--- Initialized per_sample_output_buffers - * rng_state -----------------------| |--- Initialized work_array - * data ----------------------------| |--- Initialized work_pool_wgs - * start_sample --------------------| | - * sx ------------------------------| | - * sy ------------------------------| | - * sw ------------------------------| | - * sh ------------------------------| | - * stride --------------------------| | - * queuesize -----------------------| | - * num_samples ---------------------| | - * - * Note on Queues : + * Note on Queues: * All slots in queues are initialized to queue empty slot; * The number of elements in the queues is initialized to 0; */ + +/* Distributes an amount of work across all threads + * note: work done inside the loop may not show up to all threads till after + * the current kernel has completed + */ +#define parallel_for(kg, iter_name, work_size) \ + for(size_t _size = (work_size), \ + _global_size = ccl_global_size(0) * ccl_global_size(1), \ + _n = _size / _global_size, \ + _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), 
\ + iter_name = (_n > 0) ? (_thread * _n) : (_thread) \ + ; \ + (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \ + ; \ + iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \ + ) + +#ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( +#else +void KERNEL_FUNCTION_FULL_NAME(data_init)( +#endif KernelGlobals *kg, - ShaderData *sd_DL_shadow, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ +#ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, -#include "../kernel_textures.h" +#include "kernel/kernel_textures.h" +#endif - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if 
scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global unsigned int *work_pools, /* Work pool for each work group */ + unsigned int num_samples, + ccl_global float *buffer) { +#ifdef __KERNEL_OPENCL__ kg->data = data; - kg->sd_input = sd_DL_shadow; - kg->isect_shadow = Intersection_coop_shadow; +#endif + + kernel_split_params.x = sx; + kernel_split_params.y = sy; + kernel_split_params.w = sw; + kernel_split_params.h = sh; + + kernel_split_params.offset = offset; + kernel_split_params.stride = stride; + + kernel_split_params.rng_state = rng_state; + + kernel_split_params.start_sample = start_sample; + kernel_split_params.end_sample = end_sample; + + kernel_split_params.work_pools = work_pools; + kernel_split_params.num_samples = num_samples; + + kernel_split_params.queue_index = Queue_index; + kernel_split_params.queue_size = queuesize; + kernel_split_params.use_queues_flag = use_queues_flag; + + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); + +#ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ kg->name = name; -#include "../kernel_textures.h" - - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); +#include "kernel/kernel_textures.h" +#endif -#ifdef __WORK_STEALING__ - int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); - /* Initialize work_pool_wgs */ - if(lid == 0) { - int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); - work_pool_wgs[group_index] = 0; - } - 
barrier(CLK_LOCAL_MEM_FENCE); -#endif /* __WORK_STEALING__ */ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); /* Initialize queue data and queue index. */ if(thread_index < queuesize) { /* Initialize active ray queue. */ - Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize background and buffer update queue. */ - Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize shadow ray cast of AO queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize shadow ray cast of direct lighting queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; } if(thread_index == 0) { @@ -126,109 +123,31 @@ ccl_device void kernel_data_init( /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. 
*/ - use_queues_flag[0] = 0; + *use_queues_flag = 0; } - int x = get_global_id(0); - int y = get_global_id(1); + /* zero the tiles pixels and initialize rng_state if this is the first sample */ + if(start_sample == 0) { + parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) { + int pixel = i / kernel_data.film.pass_stride; + int pass = i % kernel_data.film.pass_stride; - if(x < (sw * parallel_samples) && y < sh) { - int ray_index = x + y * (sw * parallel_samples); + int x = sx + pixel % sw; + int y = sy + pixel / sw; - /* This is the first assignment to ray_state; - * So we dont use ASSIGN_RAY_STATE macro. - */ - ray_state[ray_index] = RAY_ACTIVE; - - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - unsigned int my_work = 0; - /* Get work. */ - get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - /* Get the sample associated with the work. */ - my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - - my_sample_tile = 0; - - /* Get pixel and tile position associated with the work. */ - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - work_array[ray_index] = my_work; -#else /* __WORK_STEALING__ */ - unsigned int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); - my_sample = my_sample_tile + start_sample; - - /* Initialize work array. */ - work_array[ray_index] = my_sample ; - - /* Calculate pixel position of this ray. */ - pixel_x = sx + tile_x; - pixel_y = sy + tile_y; -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - - /* Initialise per_sample_output_buffers to all zeros. 
*/ - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; - int per_sample_output_buffers_iterator = 0; - for(per_sample_output_buffers_iterator = 0; - per_sample_output_buffers_iterator < kernel_data.film.pass_stride; - per_sample_output_buffers_iterator++) - { - per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; - } + int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass; - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, - rng_state, - my_sample, - pixel_x, pixel_y, - &rng_coop[ray_index], - &Ray_coop[ray_index]); - - if(Ray_coop[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. - */ - throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - L_transparent_coop[ray_index] = 0.0f; - path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); - path_state_init(kg, - kg->sd_input, - &PathState_coop[ray_index], - &rng_coop[ray_index], - my_sample, - &Ray_coop[ray_index]); -#ifdef __KERNEL_DEBUG__ - debug_data_init(&debugdata_coop[ray_index]); -#endif + *(buffer + index) = 0.0f; } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); - path_rng_end(kg, rng_state, rng_coop[ray_index]); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - } - /* Mark rest of the ray-state indices as RAY_INACTIVE. 
*/ - if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { - /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ - ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + parallel_for(kg, i, sw * sh) { + int x = sx + i % sw; + int y = sy + i / sw; + + int index = (offset + x + y*stride); + *(rng_state + index) = hash_int_2d(x, y); + } } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 82ca18829d3..bdbf7387b95 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -14,95 +14,144 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_direct_lighting kernel. - * This is the eighth kernel in the ray tracing logic. This is the seventh - * of the path iteration kernels. This kernel takes care of direct lighting - * logic. However, the "shadow ray cast" part of direct lighting is handled +/* This kernel takes care of direct lighting logic. + * However, the "shadow ray cast" part of direct lighting is handled * in the next kernel. * - * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed. - * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and - * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS + * This kernels determines the rays for which a shadow_blocked() function + * associated with direct lighting should be executed. 
Those rays for which + * a shadow_blocked() function for direct-lighting must be executed, are + * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue + * QUEUE_SHADOW_RAY_CAST_DL_RAYS * - * The input and output are as follows, + * Note on Queues: + * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue + * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute + * the corresponding shadow_blocked part, after direct lighting, the ray is + * marked with RAY_SHADOW_RAY_CAST_DL flag. * - * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop - * PathState_coop -----------------------------------| |--- ISLamp_coop - * sd -----------------------------------------------| |--- LightRay_coop - * ray_state ----------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | - * kg (globals) -------------------------------------| | - * queuesize ----------------------------------------| | - * - * Note on Queues : - * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes - * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked - * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag. - * - * State of queues when this kernel is called : - * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same - * before and after this kernel call. - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this - * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. + * State of queues when this kernel is called: + * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this + * kernel call. 
+ * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a + * shadow_blocked function must be executed, after this kernel call + * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. */ -ccl_device char kernel_direct_lighting( - KernelGlobals *kg, - ShaderData *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_direct_lighting(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) { + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + char enqueue_flag = 0; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &PathState_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. 
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; /* direct lighting */ #ifdef __EMISSION__ - if((kernel_data.integrator.use_direct_light && - (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) - { + RNG rng = kernel_split_state.rng[ray_index]; + bool flag = (kernel_data.integrator.use_direct_light && + (sd->flag & SD_BSDF_HAS_EVAL)); +# ifdef __SHADOW_TRICKS__ + if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { + flag = false; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + kernel_branched_path_surface_connect_light(kg, + &rng, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + 1); + } +# endif /* __SHADOW_TRICKS__ */ + if(flag) { /* Sample illumination from lights to find path contribution. 
*/ - ccl_global RNG* rng = &rng_coop[ray_index]; - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, &rng, state); LightSample ls; if(light_sample(kg, light_t, light_u, light_v, - ccl_fetch(sd, time), - ccl_fetch(sd, P), + sd->time, + sd->P, state->bounce, &ls)) { Ray light_ray; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif +# ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +# endif BsdfEval L_light; bool is_lamp; - if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { + if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* Write intermediate data to global memory to access from * the next kernel. */ - LightRay_coop[ray_index] = light_ray; - BSDFEval_coop[ray_index] = L_light; - ISLamp_coop[ray_index] = is_lamp; + kernel_split_state.light_ray[ray_index] = light_ray; + kernel_split_state.bsdf_eval[ray_index] = L_light; + kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } } + kernel_split_state.rng[ray_index] = rng; #endif /* __EMISSION__ */ } - return enqueue_flag; + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h new file mode 100644 index 00000000000..47d3c280831 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -0,0 +1,95 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_do_volume(KernelGlobals *kg) +{ +#ifdef __VOLUME__ + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + /* Fetch use_queues_flag. 
*/ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + + bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index]; + + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + /* volume attenuation, emission, scatter */ + if(state->volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = *ray; + volume_ray.t = (hit)? 
isect->t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED); + else + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER); + } +# endif + } + } + kernel_split_state.rng[ray_index] = rng; + } + +#endif +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 5d951b972ed..9fc853a84bf 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -14,157 +14,159 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. - * This is the sixth kernel in the ray tracing logic. This is the fifth - * of the path iteration kernels. This kernel takes care of the logic to process - * "material of type holdout", indirect primitive emission, bsdf blurring, - * probabilistic path termination and AO. +/* This kernel takes care of the logic to process "material of type holdout", + * indirect primitive emission, bsdf blurring, probabilistic path termination + * and AO. * - * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed. 
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and - * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS + * This kernel determines the rays for which a shadow_blocked() function + * associated with AO should be executed. Those rays for which a + * shadow_blocked() function for AO must be executed are marked with flag + * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue + * QUEUE_SHADOW_RAY_CAST_AO_RAYS * * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER * - * The input and output are as follows, + * Note on Queues: + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS + * and processes only the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and + * reach RAY_UPDATE_BUFFER state. These rays are enqueued into + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present + * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has + * been changed to RAY_UPDATE_BUFFER, there is no problem.
* - * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * throughput_coop --------------------------------------| |--- PathState_coop - * PathRadiance_coop ------------------------------------| |--- throughput_coop - * Intersection_coop ------------------------------------| |--- L_transparent_coop - * PathState_coop ---------------------------------------| |--- per_sample_output_buffers - * L_transparent_coop -----------------------------------| |--- PathRadiance_coop - * sd ---------------------------------------------------| |--- ShaderData - * ray_state --------------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop - * kg (globals) -----------------------------------------| |--- AOBSDF_coop - * parallel_samples -------------------------------------| |--- AOLightRay_coop - * per_sample_output_buffers ----------------------------| | - * sw ---------------------------------------------------| | - * sh ---------------------------------------------------| | - * sx ---------------------------------------------------| | - * sy ---------------------------------------------------| | - * stride -----------------------------------------------| | - * work_array -------------------------------------------| | - * queuesize --------------------------------------------| | - * start_sample -----------------------------------------| | - * - * Note on Queues : - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only - * the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER - * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
These rays will - * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been - * changed to RAY_UPDATE_BUFFER, there is no problem. - * - * State of queues when this kernel is called : + * State of queues when this kernel is called: * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and + * RAY_REGENERATED rays + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE rays. + * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays - * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED and RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. 
+ * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with + * flag RAY_SHADOW_RAY_CAST_AO */ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, - ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index, - char *enqueue_flag, - char *enqueue_flag_AO_SHADOW_RAY_CAST) + ccl_local_param BackgroundAOLocals *locals) { -#ifdef __WORK_STEALING__ - unsigned int my_work; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + locals->queue_atomics_bg = 0; + locals->queue_atomics_ao = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + 
kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + int stride = kernel_split_params.stride; + + unsigned int work_index; unsigned int pixel_x; unsigned int pixel_y; -#endif + unsigned int tile_x; unsigned int tile_y; - int my_sample_tile; unsigned int sample; - ccl_global RNG *rng = 0x0; + RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = 0x0; float3 throughput; + ccl_global char *ray_state = kernel_split_state.ray_state; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - throughput = throughput_coop[ray_index]; - state = &PathState_coop[ray_index]; - rng = &rng_coop[ray_index]; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, + throughput = kernel_split_state.throughput[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + + work_index = kernel_split_state.work_array[ray_index]; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, + work_index, ray_index); - my_sample_tile = 0; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - /* Buffer's stride is "stride"; Find x and y using ray_index. 
*/ - int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - per_sample_output_buffers += - (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * - kernel_data.film.pass_stride; + + buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; + +#ifdef __SHADOW_TRICKS__ + if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { + if (state->flag & PATH_RAY_CAMERA) { + state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state->catcher_object = sd->object; + if(!kernel_data.background.transparent) { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); + } + } + } + else { + state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; + } +#endif /* __SHADOW_TRICKS__ */ /* holdout */ #ifdef __HOLDOUT__ - if(((ccl_fetch(sd, flag) & SD_HOLDOUT) || - (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) && + if(((sd->flag & SD_HOLDOUT) || + (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { if(kernel_data.background.transparent) { float3 holdout_weight; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { holdout_weight = make_float3(1.0f, 1.0f, 1.0f); } else { holdout_weight = shader_holdout_eval(kg, sd); } /* any throughput is ok, should all be identical here */ - L_transparent_coop[ray_index] += average(holdout_weight*throughput); + kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + 
enqueue_flag = 1; } } #endif /* __HOLDOUT__ */ } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; /* Holdout mask objects do not write data passes. */ kernel_write_data_passes(kg, - per_sample_output_buffers, + buffer, L, sd, sample, @@ -183,12 +185,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __EMISSION__ /* emission */ - if(ccl_fetch(sd, flag) & SD_EMISSION) { + if(sd->flag & SD_EMISSION) { /* TODO(sergey): is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission( kg, sd, - Intersection_coop[ray_index].t, + kernel_split_state.isect[ray_index].t, state->flag, state->ray_pdf); path_radiance_accum_emission(L, throughput, emission, state->bounce); @@ -203,18 +205,18 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(probability == 0.0f) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE); if(terminate >= probability) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } else { - throughput_coop[ray_index] = throughput/probability; + kernel_split_state.throughput[ray_index] = throughput/probability; } } } @@ -224,37 +226,65 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || - (ccl_fetch(sd, flag) & SD_AO)) + (sd->flag & SD_AO)) { /* todo: solve correlation */ float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, 
&rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; - AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); float3 ao_D; float ao_pdf; sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray _ray; - _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.P = ray_offset(sd->P, sd->Ng); _ray.D = ao_D; _ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - _ray.time = ccl_fetch(sd, time); + _ray.time = sd->time; #endif - _ray.dP = ccl_fetch(sd, dP); + _ray.dP = sd->dP; _ray.dD = differential3_zero(); - AOLightRay_coop[ray_index] = _ray; + kernel_split_state.ao_light_ray[ray_index] = _ray; ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - *enqueue_flag_AO_SHADOW_RAY_CAST = 1; + enqueue_flag_AO_SHADOW_RAY_CAST = 1; } } } #endif /* __AO__ */ + kernel_split_state.rng[ray_index] = rng; + + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &locals->queue_atomics_bg, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag_AO_SHADOW_RAY_CAST, + kernel_split_params.queue_size, + &locals->queue_atomics_ao, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h new file mode 100644 index 00000000000..8192528622e --- /dev/null +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -0,0 +1,82 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_indirect_background(KernelGlobals *kg) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int ray_index; + + if(kernel_data.integrator.ao_bounces) { + ray_index = get_ray_index(kg, thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index != QUEUE_EMPTY_SLOT) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + if(state->bounce > kernel_data.integrator.ao_bounces) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + } + } + } + + ray_index = get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + *L_transparent = (*L_transparent) + average((*throughput)); +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); + path_radiance_accum_background(L, state, 
(*throughput), L_background); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + } + + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h new file mode 100644 index 00000000000..a56e85abeb9 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) +{ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. 
*/ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + + int ray_index; + get_ray_index(kg, thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + ray_index = get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __SUBSURFACE__ + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ccl_global char *ray_state = kernel_split_state.ray_state; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + kernel_path_subsurface_accum_indirect(ss_indirect, L); + + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. + */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + } + +#endif /* __SUBSURFACE__ */ + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index 3bd0e361078..c669d79ddcd 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -14,50 +14,49 @@ * limitations under the License. 
*/ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_lamp_emission - * This is the 3rd kernel in the ray-tracing logic. This is the second of the - * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. - * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE - * and RAY_HIT_BACKGROUND. +/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND. * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. - * The input/output of the kernel is as follows, - * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop - * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * kg (globals) ---------------------------------------| | - * Intersection_coop ----------------------------------| | - * ray_state ------------------------------------------| | - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| | - * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| | - * queuesize ------------------------------------------| | - * use_queues_flag ------------------------------------| | - * sw -------------------------------------------------| | - * sh -------------------------------------------------| | */ -ccl_device void kernel_lamp_emission( - KernelGlobals *kg, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - 
ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int ray_index) +ccl_device void kernel_lamp_emission(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) +#ifndef __VOLUME__ + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } +#endif + /* Fetch use_queues_flag. */ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, +#ifndef __VOLUME__ + 1 +#else + 0 +#endif + ); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - float3 throughput = throughput_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { @@ -65,7 +64,7 @@ ccl_device void kernel_lamp_emission( Ray light_ray; light_ray.P = ray.P - state->ray_t*ray.D; - state->ray_t += Intersection_coop[ray_index].t; + state->ray_t += kernel_split_state.isect[ray_index].t; light_ray.D = ray.D; light_ray.t = state->ray_t; light_ray.time = ray.time; @@ -74,10 
+73,13 @@ ccl_device void kernel_lamp_emission( /* intersect with lamp */ float3 emission; - if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) { + if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) { path_radiance_accum_emission(L, throughput, emission, state->bounce); } } #endif /* __LAMP_MIS__ */ } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 816f3a6fbff..1bebc16e25b 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -14,128 +14,163 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_setup_next_iteration kernel. - * This is the tenth kernel in the ray tracing logic. This is the ninth - * of the path iteration kernels. This kernel takes care of setting up - * Ray for the next iteration of path-iteration and accumulating radiance - * corresponding to AO and direct-lighting +/*This kernel takes care of setting up ray for the next iteration of + * path-iteration and accumulating radiance corresponding to AO and + * direct-lighting * - * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * Ray state of rays that are terminated in this kernel are changed + * to RAY_UPDATE_BUFFER. * - * The input and output are as follows, + * Note on queues: + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS + * and processes only the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and + * reach RAY_UPDATE_BUFF state. These rays are enqueued into + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
These rays will still be present + * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has + * been changed to RAY_UPDATE_BUFF, there is no problem. * - * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * PathRadiance_coop ------------------------------------| |--- throughput_coop - * PathState_coop ---------------------------------------| |--- PathRadiance_coop - * sd ---------------------------------------------------| |--- PathState_coop - * ray_state --------------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------| |--- Ray_coop - * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag - * Ray_coop ---------------------------------------------| | - * kg (globals) -----------------------------------------| | - * LightRay_dl_coop -------------------------------------| - * ISLamp_coop ------------------------------------------| - * BSDFEval_coop ----------------------------------------| - * LightRay_ao_coop -------------------------------------| - * AOBSDF_coop ------------------------------------------| - * AOAlpha_coop -----------------------------------------| - * - * Note on queues, - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only - * the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF - * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will - * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been - * changed to RAY_UPDATE_BUFF, there is no problem. 
- * - * State of queues when this kernel is called : + * State of queues when this kernel is called: * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED, RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. 
*/ -ccl_device char kernel_next_iteration_setup( - KernelGlobals *kg, - ShaderData *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ - int ray_index) +ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) { + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + *kernel_split_params.use_queues_flag = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. 
+ */ + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + char enqueue_flag = 0; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif /* Load ShaderData structure. */ PathRadiance *L = NULL; ccl_global PathState *state = NULL; + ccl_global char *ray_state = kernel_split_state.ray_state; /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - float3 _throughput = throughput_coop[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + L = &kernel_split_state.path_radiance[ray_index]; + float3 _throughput = kernel_split_state.throughput[ray_index]; if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = LightRay_ao_coop[ray_index].P; - char update_path_radiance = LightRay_ao_coop[ray_index].t; + float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; + // TODO(mai): investigate correctness here + char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; if(update_path_radiance) { path_radiance_accum_ao(L, _throughput, - AOAlpha_coop[ray_index], - AOBSDF_coop[ray_index], + kernel_split_state.ao_alpha[ray_index], + kernel_split_state.ao_bsdf[ray_index], shadow, state->bounce); } + else { + path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]); + } REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); } if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = LightRay_dl_coop[ray_index].P; - char update_path_radiance = LightRay_dl_coop[ray_index].t; + float3 shadow = kernel_split_state.light_ray[ray_index].P; + // TODO(mai): investigate correctness here + char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; if(update_path_radiance) { - BsdfEval L_light = BSDFEval_coop[ray_index]; path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, - ISLamp_coop[ray_index]); + kernel_split_state.is_lamp[ray_index]); + } + else { + path_radiance_accum_total_light(L, _throughput, &L_light); } REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } } if(IS_STATE(ray_state, ray_index, 
RAY_ACTIVE)) { - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global RNG *rng = &rng_coop[ray_index]; - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + L = &kernel_split_state.path_radiance[ray_index]; /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); enqueue_flag = 1; } + kernel_split_state.rng[ray_index] = rng; } - return enqueue_flag; +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h new file mode 100644 index 00000000000..a7ecde7c80d --- /dev/null +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. + * + * Rays that get no work from get_next_work() (presumably those outside the tile boundary) are marked RAY_INACTIVE. + */ +ccl_device void kernel_path_init(KernelGlobals *kg) { + int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + /* This is the first assignment to ray_state, + * so we don't use the ASSIGN_RAY_STATE macro. + */ + kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; + + unsigned int my_sample; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int tile_x; + unsigned int tile_y; + + unsigned int work_index = 0; + /* Get work. */ + if(!get_next_work(kg, &work_index, ray_index)) { + /* No more work, mark ray as inactive. */ + kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; + + return; + } + + /* Get the sample associated with the work. */ + my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + + /* Get pixel and tile position associated with the work. */ + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, + &tile_x, &tile_y, + work_index, + ray_index); + kernel_split_state.work_array[ray_index] = work_index; + + ccl_global uint *rng_state = kernel_split_params.rng_state; + rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; + + ccl_global float *buffer = kernel_split_params.buffer; + buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; + + RNG rng = kernel_split_state.rng[ray_index]; + + /* Initialize random numbers and ray.
 */ + kernel_path_trace_setup(kg, + rng_state, + my_sample, + pixel_x, pixel_y, + &rng, + &kernel_split_state.ray[ray_index]); + + if(kernel_split_state.ray[ray_index].t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; + * these rays proceed with path-iteration. + */ + kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + kernel_split_state.L_transparent[ray_index] = 0.0f; + path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, + &kernel_split_state.sd_DL_shadow[ray_index], + &kernel_split_state.path_state[ray_index], + &rng, + my_sample, + &kernel_split_state.ray[ray_index]); +#ifdef __SUBSURFACE__ + kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); +#endif + +#ifdef __KERNEL_DEBUG__ + debug_data_init(&kernel_split_state.debug_data[ray_index]); +#endif + } + else { + /* Ray has t == 0: these rays do not participate in path-iteration. NOTE(review): path_rng_end() below receives the stored kernel_split_state.rng[ray_index], not the local rng that was passed to kernel_path_trace_setup() above and is written back after this branch - confirm this is intended. */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Accumulate result in output buffer. */ + kernel_write_pass_float4(buffer, my_sample, L_rad); + path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); + } + kernel_split_state.rng[ray_index] = rng; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h new file mode 100644 index 00000000000..e2e841f36d3 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -0,0 +1,90 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel enqueues rays of different ray state into their + * appropriate queues: + * + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel (RAY_HIT_BACKGROUND), and rays in state + * RAY_UPDATE_BUFFER, are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays actively participating in path-iteration (RAY_ACTIVE or + * RAY_REGENERATED) are enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * State of queue during other times this kernel is called: + * At entry, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE + * and RAY_UPDATE_BUFFER rays. + * At exit, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */ +ccl_device void kernel_queue_enqueue(KernelGlobals *kg, + ccl_local_param QueueEnqueueLocals *locals) +{ + /* We have only 2 cases (Hit/Not-Hit), hence the two local queue counters. */ + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(lidx == 0) { + locals->queue_atomics[0] = 0; + locals->queue_atomics[1] = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(lidx == 0) { + locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + locals->queue_atomics, + kernel_split_params.queue_index); + locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + locals->queue_atomics, + kernel_split_params.queue_index); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + kernel_split_params.queue_size, + my_lqidx, + locals->queue_atomics); + kernel_split_state.queue_data[my_gqidx] = ray_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 2388580051f..684760eedee 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ 
b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -14,81 +14,47 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_scene_intersect kernel. - * This is the second kernel in the ray tracing logic. This is the first - * of the path iteration kernels. This kernel takes care of scene_intersect function. +/* This kernel takes care of scene_intersect function. * * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. * This kernel processes rays of ray state RAY_ACTIVE - * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND. - * - * The input and output are as follows, - * - * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState - * PathState_coop ---------------------------------| |--- Intersection - * ray_state --------------------------------------| |--- ray_state - * use_queues_flag --------------------------------| | - * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | - * kg (globals) -----------------------------------| | - * rng_coop ---------------------------------------| | - * sw ---------------------------------------------| | - * sh ---------------------------------------------| | - * queuesize --------------------------------------| | - * - * Note on Queues : - * Ideally we would want kernel_scene_intersect to work on queues. - * But during the very first time, the queues will be empty and hence we perform a direct mapping - * between ray-index and thread-index; From the next time onward, the queue will be filled and - * we may start operating on queues. 
- * - * State of queue during the first time this kernel is called : - * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel - * - * State of queues during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays; - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ; - * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These - * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing - * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from - * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays - * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues) - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and - * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change + * This kernel determines the rays that have hit the background and changes + * their ray state to RAY_HIT_BACKGROUND. 
*/ - -ccl_device void kernel_scene_intersect( - KernelGlobals *kg, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int ray_index) +ccl_device void kernel_scene_intersect(KernelGlobals *kg) { + /* Fetch use_queues_flag */ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + /* All regenerated rays become active here */ - if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) return; #ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; + DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; #endif - Intersection *isect = &Intersection_coop[ray_index]; - PathState state = PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + Intersection isect; + PathState state = kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; /* intersect scene */ uint visibility = 
path_state_ray_visibility(kg, &state); @@ -96,7 +62,7 @@ ccl_device void kernel_scene_intersect( #ifdef __HAIR__ float difl = 0.0f, extmax = 0.0f; uint lcg_state = 0; - RNG rng = rng_coop[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; if(kernel_data.bvh.have_curves) { if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { @@ -106,19 +72,25 @@ ccl_device void kernel_scene_intersect( } extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(&rng, &state, 0x51633e2d); + lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d); + } + + if(state.bounce > kernel_data.integrator.ao_bounces) { + visibility = PATH_RAY_SHADOW; + ray.t = kernel_data.background.ao_distance; } - bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax); + bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); #else - bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f); + bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); #endif + kernel_split_state.isect[ray_index] = isect; #ifdef __KERNEL_DEBUG__ if(state.flag & PATH_RAY_CAMERA) { - debug_data->num_bvh_traversed_nodes += isect->num_traversed_nodes; - debug_data->num_bvh_traversed_instances += isect->num_traversed_instances; - debug_data->num_bvh_intersections += isect->num_intersections; + debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes; + debug_data->num_bvh_traversed_instances += isect.num_traversed_instances; + debug_data->num_bvh_intersections += isect.num_intersections; } debug_data->num_ray_bounces++; #endif @@ -128,6 +100,8 @@ ccl_device void kernel_scene_intersect( * These rays undergo special processing in the * background_bufferUpdate kernel. 
*/ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index cef64bf5f36..0f1696e34a0 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -14,57 +14,58 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_shader_eval kernel - * This kernel is the 5th kernel in the ray tracing logic. This is - * the 4rd kernel in path iteration. This kernel sets up the ShaderData - * structure from the values computed by the previous kernels. It also identifies - * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. 
* - * The input and output of the kernel is as follows, - * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd - * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Intersection_coop ----------------------------------| | - * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------| | - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| | - * ray_state ------------------------------------------| | - * kg (globals) ---------------------------------------| | - * queuesize ------------------------------------------| | - * - * Note on Queues : - * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes - * only the rays of state RAY_ACTIVE; - * State of queues when this kernel is called, - * at entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * at exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ -ccl_device void kernel_shader_eval( - KernelGlobals *kg, - ShaderData *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_shader_eval(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - Intersection *isect = &Intersection_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + char enqueue_flag = 0; + if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. 
*/ + if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; shader_setup_from_ray(kg, - sd, - isect, + &kernel_split_state.sd[ray_index], + &isect, &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + kernel_split_state.rng[ray_index] = rng; } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h deleted file mode 100644 index 6153af47f96..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_split_common.h" - -/* Note on kernel_shadow_blocked kernel. - * This is the ninth kernel in the ray tracing logic. This is the eighth - * of the path iteration kernels. 
This kernel takes care of "shadow ray cast" - * logic of the direct lighting and AO part of ray tracing. - * - * The input and output are as follows, - * - * PathState_coop ----------------------------------|--- kernel_shadow_blocked --| - * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop - * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop - * ray_state ---------------------------------------| |--- ray_state - * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS) - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | - * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | - * kg (globals) ------------------------------------| | - * queuesize ---------------------------------------| | - * - * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself. - * Note on queues : - * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty - * these queues this kernel. - * State of queues when this kernel is called : - * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same - * before and after this kernel call. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO - * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. 
- */ -ccl_device void kernel_shadow_blocked( - KernelGlobals *kg, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - char shadow_blocked_type, - int ray_index) -{ - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - ccl_global PathState *state = &PathState_coop[ray_index]; - ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; - ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; - - ccl_global Ray *light_ray_global = - shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO - ? light_ray_ao_global - : light_ray_dl_global; - - float3 shadow; - update_path_radiance = !(shadow_blocked(kg, - kg->sd_input, - state, - light_ray_global, - &shadow)); - - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. - */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; - } -} diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h new file mode 100644 index 00000000000..4243e18de72 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -0,0 +1,60 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Shadow ray cast for AO. */ +ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) +{ + unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + /* Flag determining if we need to update L. */ + char update_path_radiance = 0; + + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index]; + + float3 shadow; + Ray ray = *light_ray_global; + update_path_radiance = !(shadow_blocked(kg, + &kernel_split_state.sd_DL_shadow[ray_index], + state, + &ray, + &shadow)); + + *light_ray_global = ray; + /* We use light_ray_global's P and t to store shadow and + * update_path_radiance. + */ + light_ray_global->P = shadow; + light_ray_global->t = update_path_radiance; + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h new file mode 100644 index 00000000000..bb8f0157965 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Shadow ray cast for direct visible light. */ +ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) +{ + unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < dl_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + /* Flag determining if we need to update L. */ + char update_path_radiance = 0; + + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index]; + + float3 shadow; + Ray ray = *light_ray_global; + update_path_radiance = !(shadow_blocked(kg, + &kernel_split_state.sd_DL_shadow[ray_index], + state, + &ray, + &shadow)); + + *light_ray_global = ray; + /* We use light_ray_global's P and t to store shadow and + * update_path_radiance. 
+ */ + light_ray_global->P = shadow; + light_ray_global->t = update_path_radiance; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 2135ee22b2e..4303ba0a905 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -17,48 +17,61 @@ #ifndef __KERNEL_SPLIT_H__ #define __KERNEL_SPLIT_H__ -#include "kernel_compat_opencl.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_image_opencl.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" -#include "util_atomic.h" +#include "kernel/split/kernel_split_data.h" -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_montecarlo.h" -#include "kernel_differential.h" -#include "kernel_camera.h" +#include "kernel/kernel_globals.h" -#include "geom/geom.h" -#include "bvh/bvh.h" +#ifdef __OSL__ +# include "kernel/osl/osl_shader.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "kernel/kernel_image_opencl.h" +#endif +#ifdef __KERNEL_CPU__ +# include "kernel/kernels/cpu/kernel_cpu_image.h" +#endif + +#include "util/util_atomic.h" + +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_camera.h" + +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" -#include "kernel_light.h" -#include "kernel_passes.h" +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" #ifdef __SUBSURFACE__ -#include "kernel_subsurface.h" +# include "kernel/kernel_subsurface.h" #endif #ifdef __VOLUME__ -#include "kernel_volume.h" +# include "kernel/kernel_volume.h" #endif -#include "kernel_path_state.h" -#include "kernel_shadow.h" -#include 
"kernel_emission.h" -#include "kernel_path_common.h" -#include "kernel_path_surface.h" -#include "kernel_path_volume.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shadow.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_path_common.h" +#include "kernel/kernel_path_surface.h" +#include "kernel/kernel_path_volume.h" +#include "kernel/kernel_path_subsurface.h" #ifdef __KERNEL_DEBUG__ -#include "kernel_debug.h" +# include "kernel/kernel_debug.h" #endif -#include "kernel_queues.h" -#include "kernel_work_stealing.h" +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h new file mode 100644 index 00000000000..17e6587883a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -0,0 +1,75 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_H__ +#define __KERNEL_SPLIT_DATA_H__ + +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) +{ + (void)kg; /* Unused on CPU. 
*/ + + uint64_t size = 0; +#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) + size = size SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + +#ifdef __SUBSURFACE__ + size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */ +#endif + +#ifdef __VOLUME__ + size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */ +#endif + + return size; +} + +ccl_device_inline void split_data_init(KernelGlobals *kg, + ccl_global SplitData *split_data, + size_t num_elements, + ccl_global void *data, + ccl_global char *ray_state) +{ + (void)kg; /* Unused on CPU. */ + + ccl_global char *p = (ccl_global char*)data; + +#define SPLIT_DATA_ENTRY(type, name, num) \ + split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16); + SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + +#ifdef __SUBSURFACE__ + split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p; + p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); +#endif + +#ifdef __VOLUME__ + split_data->state_shadow = (ccl_global PathState*)p; + p += align_up(2 * num_elements * sizeof(PathState), 16); +#endif + + split_data->ray_state = ray_state; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h new file mode 100644 index 00000000000..748197b7183 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ +#define __KERNEL_SPLIT_DATA_TYPES_H__ + +CCL_NAMESPACE_BEGIN + +/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */ + +typedef struct SplitParams { + int x; + int y; + int w; + int h; + + int offset; + int stride; + + ccl_global uint *rng_state; + + int start_sample; + int end_sample; + + ccl_global unsigned int *work_pools; + unsigned int num_samples; + + ccl_global int *queue_index; + int queue_size; + ccl_global char *use_queues_flag; + + ccl_global float *buffer; +} SplitParams; + +/* Global memory variables [porting]; This memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. 
+ */ + +/* SPLIT_DATA_ENTRY(type, name, num) */ + +#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__) +/* DebugData memory */ +# define SPLIT_DATA_DEBUG_ENTRIES \ + SPLIT_DATA_ENTRY(DebugData, debug_data, 1) +#else +# define SPLIT_DATA_DEBUG_ENTRIES +#endif + +#define SPLIT_DATA_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? 
*/ \ + SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_DEBUG_ENTRIES \ + +/* struct that holds pointers to data in the shared state buffer */ +typedef struct SplitData { +#define SPLIT_DATA_ENTRY(type, name, num) type *name; + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + +#ifdef __SUBSURFACE__ + ccl_global SubsurfaceIndirectRays *ss_rays; +#endif + +#ifdef __VOLUME__ + ccl_global PathState *state_shadow; +#endif + + /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from + * the host easily) but is still used the same as the other data so we have it here in this struct as well + */ + ccl_global char *ray_state; +} SplitData; + +#ifndef __KERNEL_CUDA__ +# define kernel_split_state (kg->split_data) +# define kernel_split_params (kg->split_param_data) +#else +__device__ SplitData __split_data; +# define kernel_split_state (__split_data) +__device__ SplitParams __split_param_data; +# define kernel_split_params (__split_param_data) +#endif /* __KERNEL_CUDA__ */ + +/* Local storage for queue_enqueue kernel. */ +typedef struct QueueEnqueueLocals { + uint queue_atomics[2]; +} QueueEnqueueLocals; + +/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */ +typedef struct BackgroundAOLocals { + uint queue_atomics_bg; + uint queue_atomics_ao; +} BackgroundAOLocals; + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h new file mode 100644 index 00000000000..0b4d50c70ee --- /dev/null +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -0,0 +1,100 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg, + ccl_local_param unsigned int* local_queue_atomics) +{ +#ifdef __SUBSURFACE__ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. 
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + + char enqueue_flag = 0; + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + ccl_global char *ray_state = kernel_split_state.ray_state; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(sd->flag & SD_BSSRDF) { + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + &rng, + ray, + throughput, + ss_indirect)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + } + kernel_split_state.rng[ray_index] = rng; + } + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __SUBSURFACE__ */ + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h deleted file mode 100644 index a21e9b6a0b1..00000000000 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../kernel_compat_opencl.h" -#include "../kernel_math.h" -#include "../kernel_types.h" -#include "../kernel_globals.h" - -/* Since we process various samples in parallel; The output radiance of different samples - * are stored in different locations; This kernel combines the output radiance contributed - * by all different samples and stores them in the RenderTile's output buffer. - */ -ccl_device void kernel_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < sw && y < sh) { - buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); - per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); - - int sample_stride = (data->film.pass_stride); - - int sample_iterator = 0; - int pass_stride_iterator = 0; - int num_floats = data->film.pass_stride; - - for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { - for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { - *(buffer + pass_stride_iterator) = - (start_sample == 0 && sample_iterator == 0) - ? 
*(per_sample_output_buffer + pass_stride_iterator) - : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); - } - per_sample_output_buffer += sample_stride; - } - } -} diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 88ec7fe6fcc..d748e76fa80 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -39,7 +39,7 @@ * mostly taken care of in the SVM compiler. */ -#include "svm_types.h" +#include "kernel/svm/svm_types.h" CCL_NAMESPACE_BEGIN @@ -139,49 +139,49 @@ CCL_NAMESPACE_END /* Nodes */ -#include "svm_noise.h" +#include "kernel/svm/svm_noise.h" #include "svm_texture.h" -#include "svm_color_util.h" -#include "svm_math_util.h" - -#include "svm_attribute.h" -#include "svm_gradient.h" -#include "svm_blackbody.h" -#include "svm_closure.h" -#include "svm_noisetex.h" -#include "svm_convert.h" -#include "svm_displace.h" -#include "svm_fresnel.h" -#include "svm_wireframe.h" -#include "svm_wavelength.h" -#include "svm_camera.h" -#include "svm_geometry.h" -#include "svm_hsv.h" -#include "svm_image.h" -#include "svm_gamma.h" -#include "svm_brightness.h" -#include "svm_invert.h" -#include "svm_light_path.h" -#include "svm_magic.h" -#include "svm_mapping.h" -#include "svm_normal.h" -#include "svm_wave.h" -#include "svm_math.h" -#include "svm_mix.h" -#include "svm_ramp.h" -#include "svm_sepcomb_hsv.h" -#include "svm_sepcomb_vector.h" -#include "svm_musgrave.h" -#include "svm_sky.h" -#include "svm_tex_coord.h" -#include "svm_value.h" -#include "svm_voronoi.h" -#include "svm_checker.h" -#include "svm_brick.h" -#include "svm_vector_transform.h" -#include "svm_voxel.h" -#include "svm_bump.h" +#include "kernel/svm/svm_color_util.h" +#include "kernel/svm/svm_math_util.h" + +#include "kernel/svm/svm_attribute.h" +#include "kernel/svm/svm_gradient.h" +#include "kernel/svm/svm_blackbody.h" +#include "kernel/svm/svm_closure.h" +#include "kernel/svm/svm_noisetex.h" +#include 
"kernel/svm/svm_convert.h" +#include "kernel/svm/svm_displace.h" +#include "kernel/svm/svm_fresnel.h" +#include "kernel/svm/svm_wireframe.h" +#include "kernel/svm/svm_wavelength.h" +#include "kernel/svm/svm_camera.h" +#include "kernel/svm/svm_geometry.h" +#include "kernel/svm/svm_hsv.h" +#include "kernel/svm/svm_image.h" +#include "kernel/svm/svm_gamma.h" +#include "kernel/svm/svm_brightness.h" +#include "kernel/svm/svm_invert.h" +#include "kernel/svm/svm_light_path.h" +#include "kernel/svm/svm_magic.h" +#include "kernel/svm/svm_mapping.h" +#include "kernel/svm/svm_normal.h" +#include "kernel/svm/svm_wave.h" +#include "kernel/svm/svm_math.h" +#include "kernel/svm/svm_mix.h" +#include "kernel/svm/svm_ramp.h" +#include "kernel/svm/svm_sepcomb_hsv.h" +#include "kernel/svm/svm_sepcomb_vector.h" +#include "kernel/svm/svm_musgrave.h" +#include "kernel/svm/svm_sky.h" +#include "kernel/svm/svm_tex_coord.h" +#include "kernel/svm/svm_value.h" +#include "kernel/svm/svm_voronoi.h" +#include "kernel/svm/svm_checker.h" +#include "kernel/svm/svm_brick.h" +#include "kernel/svm/svm_vector_transform.h" +#include "kernel/svm/svm_voxel.h" +#include "kernel/svm/svm_bump.h" CCL_NAMESPACE_BEGIN @@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = ccl_fetch(sd, shader) & SHADER_MASK; + int offset = sd->shader & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 0e55c99ae97..229a3f20421 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData AttributeDescriptor desc; - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { desc = 
find_attribute(kg, sd, node.y); if(desc.offset == ATTR_STD_NOT_FOUND) { desc = attribute_not_found(); diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index 04a8c7b64e5..610d9af9e1f 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* save state */ - stack_store_float3(stack, offset+0, ccl_fetch(sd, P)); - stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx); - stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy); + stack_store_float3(stack, offset+0, sd->P); + stack_store_float3(stack, offset+3, sd->dP.dx); + stack_store_float3(stack, offset+6, sd->dP.dy); /* set state as if undisplaced */ const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED); @@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa object_dir_transform(kg, sd, &dPdx); object_dir_transform(kg, sd, &dPdy); - ccl_fetch(sd, P) = P; - ccl_fetch(sd, dP).dx = dPdx; - ccl_fetch(sd, dP).dy = dPdy; + sd->P = P; + sd->dP.dx = dPdx; + sd->dP.dy = dPdy; } } ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* restore state */ - ccl_fetch(sd, P) = stack_load_float3(stack, offset+0); - ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3); - ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6); + sd->P = stack_load_float3(stack, offset+0); + sd->dP.dx = stack_load_float3(stack, offset+3); + sd->dP.dy = stack_load_float3(stack, offset+6); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 00678a49d70..90249dfd978 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, 
ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, ccl_fetch(sd, P)); + vector = transform_point(&tfm, sd->P); zdepth = vector.z; distance = len(vector); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 024d7d6447a..9a3689a94f4 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = 0.0f; - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); } } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { @@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); } else { bsdf->alpha_x = roughness; @@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); } } @@ -70,7 +70,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(mix_weight == 0.0f) return; - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); + float3 N = stack_valid(data_node.x)? 
stack_load_float3(stack, data_node.x): sd->N; float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w); @@ -110,10 +110,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F); /* calculate ior */ - float ior = (ccl_fetch(sd, flag) & SD_BACKFACING) ? 1.0f / eta : eta; + float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta; // calculate fresnel for refraction - float cosNO = dot(N, ccl_fetch(sd, I)); + float cosNO = dot(N, sd->I); float fresnel = fresnel_dielectric_cos(cosNO, ior); // calculate weights of the diffuse and specular part @@ -129,7 +129,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * // get the additional clearcoat normal and subsurface scattering radius uint4 data_cn_ssr = read_node(kg, offset); - float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : ccl_fetch(sd, N); + float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); // get the subsurface color @@ -137,7 +137,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float3 subsurface_color = stack_valid(data_subsurface_color.x) ? 
stack_load_float3(stack, data_subsurface_color.x) : make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w)); - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; #ifdef __SUBSURFACE__ float3 albedo = subsurface_color * subsurface + base_color * (1.0f - subsurface); @@ -163,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->roughness = roughness; /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_principled_diffuse_setup(bsdf); + sd->flag |= bsdf_principled_diffuse_setup(bsdf); } } else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) { @@ -186,7 +186,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->roughness = roughness; /* setup bsdf */ - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f)); @@ -200,7 +200,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->roughness = roughness; /* setup bsdf */ - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z)); @@ -214,7 +214,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->roughness = roughness; /* setup bsdf */ - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); } } } @@ -230,7 +230,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->roughness = 
roughness; /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_principled_diffuse_setup(bsdf); + sd->flag |= bsdf_principled_diffuse_setup(bsdf); } } #endif @@ -251,7 +251,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->N = N; /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_principled_sheen_setup(bsdf); + sd->flag |= bsdf_principled_sheen_setup(bsdf); } } @@ -286,9 +286,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf); else /* use multi-scatter GGX */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf); } } #ifdef __CAUSTICS_TRICKS__ @@ -326,7 +326,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra->cspec0 = cspec0; /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf); } } @@ -350,7 +350,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = ior; /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); } } } @@ -371,7 +371,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra->cspec0 = cspec0; /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf); } } } @@ -399,7 +399,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra->clearcoat = clearcoat; /* setup bsdf */ - ccl_fetch(sd, flag) |= 
bsdf_microfacet_ggx_clearcoat_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf); } } #ifdef __CAUSTICS_TRICKS__ @@ -409,7 +409,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; } case CLOSURE_BSDF_DIFFUSE_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); if(bsdf) { @@ -418,31 +418,31 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float roughness = param1; if(roughness == 0.0f) { - ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); + sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); } else { bsdf->roughness = roughness; - ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf); + sd->flag |= bsdf_oren_nayar_setup(bsdf); } } break; } case CLOSURE_BSDF_TRANSLUCENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); if(bsdf) { bsdf->N = N; - ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf); + sd->flag |= bsdf_translucent_setup(bsdf); } break; } case CLOSURE_BSDF_TRANSPARENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } break; } @@ -455,7 +455,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, 
sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -467,21 +467,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) { kernel_assert(stack_valid(data_node.z)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.z); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf); } break; @@ -493,7 +493,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -501,7 +501,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra = NULL; float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -509,7 +509,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_y = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_x = param1; @@ -517,9 +517,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); } } @@ -535,14 +535,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; } #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, ccl_fetch(sd, I)); + float cosNO = dot(N, sd->I); float fresnel = fresnel_dielectric_cos(cosNO, eta); float roughness = param1; @@ -581,7 +581,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); @@ -593,13 +593,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_x = param1; bsdf->alpha_y = param1; float eta = fmaxf(param2, 1e-5f); - bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + bsdf->ior = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; kernel_assert(stack_valid(data_node.z)); bsdf->extra->color = stack_load_float3(stack, data_node.z); /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); } break; @@ -612,7 +612,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -642,33 +642,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = 0.0f; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) { kernel_assert(stack_valid(data_node.w)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.w); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); } break; } case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight); if(bsdf) { bsdf->N = N; bsdf->sigma = saturate(param1); - 
ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf); + sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf); } break; } @@ -678,7 +678,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight); if(bsdf) { @@ -687,18 +687,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->smooth = param2; if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf); + sd->flag |= bsdf_diffuse_toon_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf); + sd->flag |= bsdf_glossy_toon_setup(bsdf); } break; } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; - if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { @@ -708,7 +708,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * * better figure out a way to skip backfaces from rays * spawned by transmission from the front */ bsdf->weight = make_float3(1.0f, 1.0f, 1.0f); - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } } else { @@ -722,18 +722,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(stack_valid(data_node.y)) { bsdf->T = normalize(stack_load_float3(stack, data_node.y)); } - else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { - bsdf->T = normalize(ccl_fetch(sd, dPdv)); + else if(!(sd->type & 
PRIMITIVE_ALL_CURVE)) { + bsdf->T = normalize(sd->dPdv); bsdf->offset = 0.0f; } else - bsdf->T = normalize(ccl_fetch(sd, dPdu)); + bsdf->T = normalize(sd->dPdu); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf); + sd->flag |= bsdf_hair_reflection_setup(bsdf); } else { - ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf); + sd->flag |= bsdf_hair_transmission_setup(bsdf); } } } @@ -746,8 +746,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: case CLOSURE_BSSRDF_BURLEY_ID: { - float3 albedo = ccl_fetch(sd, svm_closure_weight); - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 albedo = sd->svm_closure_weight; + float3 weight = sd->svm_closure_weight * mix_weight; float sample_weight = fabsf(average(weight)); /* disable in case of diffuse ancestor, can't see it well then and @@ -773,7 +773,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -784,7 +784,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -795,7 +795,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -825,21 
+825,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float switch(type) { case CLOSURE_VOLUME_ABSORPTION_ID: { - float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density; + float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density; ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight); if(sc) { - ccl_fetch(sd, flag) |= volume_absorption_setup(sc); + sd->flag |= volume_absorption_setup(sc); } break; } case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density; + float3 weight = sd->svm_closure_weight * mix_weight * density; HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight); if(volume) { volume->g = param2; /* g */ - ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume); + sd->flag |= volume_henyey_greenstein_setup(volume); } break; } @@ -859,12 +859,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_EMISSION; + sd->flag |= SD_EMISSION; } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) @@ -877,10 +877,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, 
sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight); } ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) @@ -893,12 +893,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_HOLDOUT; + sd->flag |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -911,19 +911,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_AO; + sd->flag |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - ccl_fetch(sd, svm_closure_weight) = weight; + sd->svm_closure_weight = weight; } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -973,7 +973,7 @@ 
ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - ccl_fetch(sd, N) = normal; + sd->N = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 890ab41aaaa..c94fa130af7 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac uint normal_offset, distance_offset, invert, use_object_space; decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; - float3 dPdx = ccl_fetch(sd, dP).dx; - float3 dPdy = ccl_fetch(sd, dP).dy; + float3 dPdx = sd->dP.dx; + float3 dPdy = sd->dP.dy; if(use_object_space) { object_inverse_normal_transform(kg, sd, &normal_in); @@ -80,14 +80,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo { float d = stack_load_float(stack, fac_offset); - float3 dP = ccl_fetch(sd, N); + float3 dP = sd->N; object_inverse_normal_transform(kg, sd, &dP); dP *= d*0.1f; /* todo: get rid of this factor */ object_dir_transform(kg, sd, &dP); - ccl_fetch(sd, P) += dP; + sd->P += dP; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 23c97d80cb0..3703ec55015 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; 
decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; eta = fmaxf(eta, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; + eta = (sd->flag & SD_BACKFACING)? 
eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); } else { - f = fabsf(dot(ccl_fetch(sd, I), normal_in)); + f = fabsf(dot(sd->I, normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 7d512f7ff4d..4a09d9f6653 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -27,15 +27,15 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P); break; - case NODE_GEOM_N: data = ccl_fetch(sd, N); break; + case NODE_GEOM_P: data = sd->P; break; + case NODE_GEOM_N: data = sd->N; break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = ccl_fetch(sd, I); break; - case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; + case NODE_GEOM_I: data = sd->I; break; + case NODE_GEOM_Ng: data = sd->Ng; break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; + case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif } @@ -48,8 +48,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -65,8 +65,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + 
ccl_fetch(sd, dP).dy; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -87,9 +87,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break; default: data = 0.0f; break; } @@ -106,44 +106,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); 
break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; + data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } @@ -177,7 +177,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, break; } /*case NODE_INFO_CURVE_FADE: { - data = ccl_fetch(sd, curve_transparency); + data = sd->curve_transparency; stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 2afdf61b476..76acc9253a1 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -144,7 +144,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; case 87: r = 
kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; - case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -238,9 +237,9 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = ccl_fetch(sd, N); + float3 N = sd->N; - N = ccl_fetch(sd, N); + N = sd->N; object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 04f6f623f18..1492e358608 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 
1.0f: 0.0f; break; + case NODE_LP_ray_length: info = sd->ray_length; break; case NODE_LP_ray_depth: info = (float)state->bounce; break; case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; @@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); + float squared = sd->ray_length*sd->ray_length; /* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. 
*/ if(isfinite(squared)) { strength *= squared/(smooth + squared); diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h index 01547b60014..a7f15de7325 100644 --- a/intern/cycles/kernel/svm/svm_math_util.h +++ b/intern/cycles/kernel/svm/svm_math_util.h @@ -134,32 +134,37 @@ ccl_device float3 svm_math_blackbody_color(float t) { { 6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f }, }; - if(t >= 12000.0f) + int i; + if(t >= 12000.0f) { return make_float3(0.826270103f, 0.994478524f, 1.56626022f); + } + else if(t >= 6365.0f) { + i = 5; + } + else if(t >= 3315.0f) { + i = 4; + } + else if(t >= 1902.0f) { + i = 3; + } + else if(t >= 1449.0f) { + i = 2; + } + else if(t >= 1167.0f) { + i = 1; + } + else if(t >= 965.0f) { + i = 0; + } + else { + /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ + return make_float3(4.70366907f, 0.0f, 0.0f); + } - /* Define a macro to reduce stack usage for nvcc */ -#define MAKE_BB_RGB(i) make_float3(\ - rc[i][0] / t + rc[i][1] * t + rc[i][2],\ - gc[i][0] / t + gc[i][1] * t + gc[i][2],\ - ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]) - - if(t >= 6365.0f) - return MAKE_BB_RGB(5); - if(t >= 3315.0f) - return MAKE_BB_RGB(4); - if(t >= 1902.0f) - return MAKE_BB_RGB(3); - if(t >= 1449.0f) - return MAKE_BB_RGB(2); - if(t >= 1167.0f) - return MAKE_BB_RGB(1); - if(t >= 965.0f) - return MAKE_BB_RGB(0); - -#undef MAKE_BB_RGB - - /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ - return make_float3(4.70366907f, 0.0f, 0.0f); + const float t_inv = 1.0f / t; + return make_float3(rc[i][0] * t_inv + rc[i][1] * t + rc[i][2], + gc[i][0] * t_inv + gc[i][1] * t + gc[i][2], + ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]); } ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma) diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h 
index 62ff38cf1c5..0347ab7b193 100644 --- a/intern/cycles/kernel/svm/svm_noisetex.h +++ b/intern/cycles/kernel/svm/svm_noisetex.h @@ -18,50 +18,42 @@ CCL_NAMESPACE_BEGIN /* Noise */ -ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color) -{ - int hard = 0; - - if(distortion != 0.0f) { - float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); - - r.x = noise(p + offset) * distortion; - r.y = noise(p) * distortion; - r.z = noise(p - offset) * distortion; - - p += r; - } - - *fac = noise_turbulence(p, detail, hard); - *color = make_float3(*fac, - noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), - noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); -} - ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) { uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset; decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset); + decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL); uint4 node2 = read_node(kg, offset); float scale = stack_load_float_default(stack, scale_offset, node2.x); float detail = stack_load_float_default(stack, detail_offset, node2.y); float distortion = stack_load_float_default(stack, distortion_offset, node2.z); - float3 co = stack_load_float3(stack, co_offset); + float3 p = stack_load_float3(stack, co_offset) * scale; + int hard = 0; - float3 color; - float f; + if(distortion != 0.0f) { + float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); + + r.x = noise(p + offset) * distortion; + r.y = noise(p) * distortion; + r.z = noise(p - offset) * distortion; - svm_noise(co*scale, detail, distortion, &f, &color); + p += r; + } - decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL); + float f = noise_turbulence(p, detail, hard); - if(stack_valid(fac_offset)) + if(stack_valid(fac_offset)) { stack_store_float(stack, fac_offset, f); - 
if(stack_valid(color_offset)) + } + if(stack_valid(color_offset)) { + float3 color = make_float3(f, + noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), + noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); stack_store_float3(stack, color_offset, color); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index c0b01262212..c94327401f5 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P); + data = sd->P; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P)); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P); else - data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); + data = transform_point(&tfm, sd->P + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P)); + data = camera_world_to_ndc(kg, sd, sd->P); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) 
- data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P); + data = sd->P; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dx); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, 
ray_dP).dx); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + if(sd->object != 
OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dy); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float3 color = stack_load_float3(stack, color_offset); color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f); - bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0; + bool is_backfacing 
= (sd->flag & SD_BACKFACING) != 0; float3 N; if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(ccl_fetch(sd, object) == OBJECT_NONE) { + if(sd->object == OBJECT_NONE) { stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL); float3 normal; - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL); } else { - normal = ccl_fetch(sd, Ng); + normal = sd->Ng; /* the normal is already inverted, which is too soon for the math here */ if(is_backfacing) { @@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); + N = safe_normalize(sd->N + (N - sd->N)*strength); } if(is_zero(N)) { - N = ccl_fetch(sd, N); + N = sd->N; } stack_store_float3(stack, normal_offset, N); @@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(desc.offset == ATTR_STD_NOT_FOUND) - generated = ccl_fetch(sd, P); + generated = sd->P; else generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL); @@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); + tangent = cross(sd->N, normalize(cross(tangent, sd->N))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index 4c32130d06d..4e92f27acdb 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ 
b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); + bool is_object = (sd->object != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index a8b3604a8a7..9e826c8c23f 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -46,7 +46,7 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, # if defined(__KERNEL_CUDA__) # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - if(id < 2048) /* TODO(dingto): Make this a variable */ + if(id < TEX_START_HALF4_CUDA_KEPLER) r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); else { float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 87e40791333..3c6353c8001 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) + if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) #else - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, ccl_fetch(sd, prim), Co); + if(sd->type & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, sd->prim, Co); else - motion_triangle_vertices(kg, 
ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); + motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); - float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); + float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); // Take the average of both axis' length pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f; } @@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, * With OpenCL 2.0 it's possible to avoid this change, but for until * then we'll be living with such an exception. */ - float3 P = ccl_fetch(sd, P); + float3 P = sd->P; float f = wireframe(kg, sd, size, pixel_size, &P); #else - float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); + float f = wireframe(kg, sd, size, pixel_size, &sd->P); #endif /* TODO(sergey): Think of faster way to calculate derivatives. 
*/ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); + float3 Px = sd->P - sd->dP.dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); } else if(bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); + float3 Py = sd->P - sd->dP.dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); } if(stack_valid(out_fac)) diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index 8eaa9de3874..17ac66644e2 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -1,14 +1,6 @@ set(INC - . - ../device - ../graph - ../kernel - ../kernel/svm - ../kernel/osl - ../bvh - ../subd - ../util + .. ../../glew-mx ) diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp index c0d429a583c..e157a385904 100644 --- a/intern/cycles/render/attribute.cpp +++ b/intern/cycles/render/attribute.cpp @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include "image.h" -#include "mesh.h" -#include "attribute.h" +#include "render/image.h" +#include "render/mesh.h" +#include "render/attribute.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_transform.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h index f4538c76369..a64eb6542d5 100644 --- a/intern/cycles/render/attribute.h +++ b/intern/cycles/render/attribute.h @@ -17,12 +17,12 @@ #ifndef __ATTRIBUTE_H__ #define __ATTRIBUTE_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_list.h" -#include "util_param.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_param.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp index 8d7d7b847fd..930debe1e33 100644 --- a/intern/cycles/render/background.cpp +++ b/intern/cycles/render/background.cpp @@ -14,17 +14,17 @@ * limitations under the License. 
*/ -#include "background.h" -#include "device.h" -#include "integrator.h" -#include "graph.h" -#include "nodes.h" -#include "scene.h" -#include "shader.h" - -#include "util_foreach.h" -#include "util_math.h" -#include "util_types.h" +#include "render/background.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/graph.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/shader.h" + +#include "util/util_foreach.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index 8029c6a9e80..db20b6ebf87 100644 --- a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -17,9 +17,9 @@ #ifndef __BACKGROUND_H__ #define __BACKGROUND_H__ -#include "node.h" +#include "graph/node.h" -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -30,7 +30,7 @@ class Shader; class Background : public Node { public: - NODE_DECLARE; + NODE_DECLARE float ao_factor; float ao_distance; diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index d9a297002c6..c0fcd517390 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "bake.h" -#include "integrator.h" +#include "render/bake.h" +#include "render/integrator.h" CCL_NAMESPACE_BEGIN @@ -171,9 +171,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("bake_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_READ_WRITE); + device->mem_alloc("bake_output", d_output, MEM_READ_WRITE); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index 25f5eb3c897..ceb94cfb682 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -17,11 +17,11 @@ #ifndef __BAKE_H__ #define __BAKE_H__ -#include "device.h" -#include "scene.h" +#include "device/device.h" +#include "render/scene.h" -#include "util_progress.h" -#include "util_vector.h" +#include "util/util_progress.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -73,7 +73,7 @@ public: bool need_update; - int total_pixel_samples; + size_t total_pixel_samples; private: BakeData *m_bake_data; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index f1692712d61..fe2c2e78926 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -16,17 +16,17 @@ #include <stdlib.h> -#include "buffers.h" -#include "device.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_hash.h" -#include "util_image.h" -#include "util_math.h" -#include "util_opengl.h" -#include "util_time.h" -#include "util_types.h" +#include "render/buffers.h" +#include "device/device.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_hash.h" +#include "util/util_image.h" +#include "util/util_math.h" +#include "util/util_opengl.h" 
+#include "util/util_time.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -129,13 +129,13 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) /* allocate buffer */ buffer.resize(params.width*params.height*params.get_passes_size()); - device->mem_alloc(buffer, MEM_READ_WRITE); + device->mem_alloc("render_buffer", buffer, MEM_READ_WRITE); device->mem_zero(buffer); /* allocate rng state */ rng_state.resize(params.width, params.height); - device->mem_alloc(rng_state, MEM_READ_WRITE); + device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); } bool RenderBuffers::copy_from_device() diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index c9c2a21079a..5c78971678a 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -17,16 +17,16 @@ #ifndef __BUFFERS_H__ #define __BUFFERS_H__ -#include "device_memory.h" +#include "device/device_memory.h" -#include "film.h" +#include "render/film.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_half.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index c8c51ec96d2..83ff8a10618 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -14,18 +14,18 @@ * limitations under the License. 
*/ -#include "camera.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "tables.h" - -#include "device.h" - -#include "util_foreach.h" -#include "util_function.h" -#include "util_math_cdf.h" -#include "util_vector.h" +#include "render/camera.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/tables.h" + +#include "device/device.h" + +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_math_cdf.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 141ef9cccef..dd6b831b347 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -17,13 +17,13 @@ #ifndef __CAMERA_H__ #define __CAMERA_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" -#include "util_boundbox.h" -#include "util_transform.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_transform.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -39,7 +39,7 @@ class Scene; class Camera : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Specifies an offset for the shutter's time interval. */ enum MotionPosition { diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp index b7f25663bc3..2569d9eec27 100644 --- a/intern/cycles/render/constant_fold.cpp +++ b/intern/cycles/render/constant_fold.cpp @@ -14,11 +14,11 @@ * limitations under the License. 
*/ -#include "constant_fold.h" -#include "graph.h" +#include "render/constant_fold.h" +#include "render/graph.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h index 7962698319f..33f93b8c0ab 100644 --- a/intern/cycles/render/constant_fold.h +++ b/intern/cycles/render/constant_fold.h @@ -17,8 +17,8 @@ #ifndef __CONSTANT_FOLD_H__ #define __CONSTANT_FOLD_H__ -#include "util_types.h" -#include "svm_types.h" +#include "util/util_types.h" +#include "kernel/svm/svm_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp index f671eb19cae..4c085b928fb 100644 --- a/intern/cycles/render/curves.cpp +++ b/intern/cycles/render/curves.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include "device.h" -#include "curves.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" - -#include "util_foreach.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_vector.h" +#include "device/device.h" +#include "render/curves.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" + +#include "util/util_foreach.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h index e41967eebf5..8834764bd63 100644 --- a/intern/cycles/render/curves.h +++ b/intern/cycles/render/curves.h @@ -17,8 +17,8 @@ #ifndef __CURVES_H__ #define __CURVES_H__ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 923252bb375..7809f4345f1 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -14,19 +14,19 @@ * 
limitations under the License. */ -#include "camera.h" -#include "device.h" -#include "film.h" -#include "integrator.h" -#include "mesh.h" -#include "scene.h" -#include "tables.h" - -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_math.h" -#include "util_math_cdf.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/film.h" +#include "render/integrator.h" +#include "render/mesh.h" +#include "render/scene.h" +#include "render/tables.h" + +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_math.h" +#include "util/util_math_cdf.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 9fa51c51f52..83c941d5c57 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -17,12 +17,12 @@ #ifndef __FILM_H__ #define __FILM_H__ -#include "util_string.h" -#include "util_vector.h" +#include "util/util_string.h" +#include "util/util_vector.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" CCL_NAMESPACE_BEGIN @@ -53,7 +53,7 @@ public: class Film : public Node { public: - NODE_DECLARE; + NODE_DECLARE float exposure; array<Pass> passes; diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp index 52c94ec2716..8e61daab49b 100644 --- a/intern/cycles/render/graph.cpp +++ b/intern/cycles/render/graph.cpp @@ -14,17 +14,18 @@ * limitations under the License. 
*/ -#include "attribute.h" -#include "graph.h" -#include "nodes.h" -#include "shader.h" -#include "constant_fold.h" - -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_queue.h" -#include "util_logging.h" +#include "render/attribute.h" +#include "render/graph.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/constant_fold.h" + +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_queue.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -195,6 +196,7 @@ bool ShaderNode::equals(const ShaderNode& other) ShaderGraph::ShaderGraph() { finalized = false; + simplified = false; num_node_ids = 0; add(new OutputNode()); } @@ -207,6 +209,8 @@ ShaderGraph::~ShaderGraph() ShaderNode *ShaderGraph::add(ShaderNode *node) { assert(!finalized); + simplified = false; + node->id = num_node_ids++; nodes.push_back(node); return node; @@ -241,6 +245,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to) { assert(!finalized); assert(from && to); + simplified = false; if(to->link) { fprintf(stderr, "Cycles shader graph connect: input already connected.\n"); @@ -273,6 +278,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to) void ShaderGraph::disconnect(ShaderOutput *from) { assert(!finalized); + simplified = false; foreach(ShaderInput *sock, from->links) { sock->link = NULL; @@ -285,6 +291,7 @@ void ShaderGraph::disconnect(ShaderInput *to) { assert(!finalized); assert(to->link); + simplified = false; ShaderOutput *from = to->link; @@ -294,6 +301,8 @@ void ShaderGraph::disconnect(ShaderInput *to) void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to) { + simplified = false; + /* Copy because disconnect modifies this list */ vector<ShaderInput*> outputs = from->links; @@ -310,9 +319,19 @@ void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to) } } 
+void ShaderGraph::simplify(Scene *scene) +{ + if(!simplified) { + default_inputs(scene->shader_manager->use_osl()); + clean(scene); + refine_bump_nodes(); + + simplified = true; + } +} + void ShaderGraph::finalize(Scene *scene, bool do_bump, - bool do_osl, bool do_simplify, bool bump_in_object_space) { @@ -322,9 +341,7 @@ void ShaderGraph::finalize(Scene *scene, * modified afterwards. */ if(!finalized) { - default_inputs(do_osl); - clean(scene); - refine_bump_nodes(); + simplify(scene); if(do_bump) bump_from_displacement(bump_in_object_space); diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 780fdf49ca4..09932695d1f 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -17,17 +17,17 @@ #ifndef __GRAPH_H__ #define __GRAPH_H__ -#include "node.h" -#include "node_type.h" +#include "graph/node.h" +#include "graph/node_type.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_list.h" -#include "util_map.h" -#include "util_param.h" -#include "util_set.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_set.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -201,14 +201,14 @@ public: /* Node definition utility macros */ #define SHADER_NODE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual ShaderNode *clone() const { return new type(*this); } \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ #define SHADER_NODE_NO_CLONE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ @@ -240,6 +240,7 @@ public: list<ShaderNode*> nodes; size_t num_node_ids; bool finalized; + bool simplified; ShaderGraph(); ~ShaderGraph(); @@ -255,9 +256,9 @@ public: void relink(ShaderNode *node, ShaderOutput *from, 
ShaderOutput *to); void remove_proxy_nodes(); + void simplify(Scene *scene); void finalize(Scene *scene, bool do_bump = false, - bool do_osl = false, bool do_simplify = false, bool bump_in_object_space = false); diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index fd8a1262208..a8c4f446bea 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include "device.h" -#include "image.h" -#include "scene.h" +#include "device/device.h" +#include "render/image.h" +#include "render/scene.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_path.h" -#include "util_progress.h" -#include "util_texture.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_progress.h" +#include "util/util_texture.h" #ifdef WITH_OSL #include <OSL/oslexec.h> @@ -156,6 +156,16 @@ ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filen } } + /* Perform preliminary checks, with meaningful logging. */ + if(!path_exists(filename)) { + VLOG(1) << "File '" << filename << "' does not exist."; + return IMAGE_DATA_TYPE_BYTE4; + } + if(path_is_directory(filename)) { + VLOG(1) << "File '" << filename << "' is a directory, can't use as image."; + return IMAGE_DATA_TYPE_BYTE4; + } + ImageInput *in = ImageInput::create(filename); if(in) { @@ -285,9 +295,8 @@ int ImageManager::add_image(const string& filename, thread_scoped_lock device_lock(device_mutex); - /* Do we have a float? */ - if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) - is_float = true; + /* Check whether it's a float texture. 
*/ + is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4); /* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */ if((type == IMAGE_DATA_TYPE_FLOAT || @@ -433,6 +442,11 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid return false; if(!img->builtin_data) { + /* NOTE: Error logging is done in meta data acquisition. */ + if(!path_exists(img->filename) || path_is_directory(img->filename)) { + return false; + } + /* load image from file through OIIO */ *in = ImageInput::create(img->filename); diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 494c74f0cdd..996b5a5b65f 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -17,13 +17,13 @@ #ifndef __IMAGE_H__ #define __IMAGE_H__ -#include "device.h" -#include "device_memory.h" +#include "device/device.h" +#include "device/device_memory.h" -#include "util_image.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_image.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index 1ab0f9874f2..a004bb5b856 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -14,16 +14,16 @@ * limitations under the License. 
*/ -#include "device.h" -#include "integrator.h" -#include "film.h" -#include "light.h" -#include "scene.h" -#include "shader.h" -#include "sobol.h" - -#include "util_foreach.h" -#include "util_hash.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/film.h" +#include "render/light.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/sobol.h" + +#include "util/util_foreach.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 27fff4831e5..9501d7f8416 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -17,9 +17,9 @@ #ifndef __INTEGRATOR_H__ #define __INTEGRATOR_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" CCL_NAMESPACE_BEGIN @@ -29,7 +29,7 @@ class Scene; class Integrator : public Node { public: - NODE_DECLARE; + NODE_DECLARE int min_bounce; int max_bounce; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 2245c861d5a..4886dcd563f 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -14,19 +14,19 @@ * limitations under the License. 
*/ -#include "background.h" -#include "device.h" -#include "integrator.h" -#include "film.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "shader.h" - -#include "util_foreach.h" -#include "util_progress.h" -#include "util_logging.h" +#include "render/background.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/film.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/shader.h" + +#include "util/util_foreach.h" +#include "util/util_progress.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY); DeviceTask main_task(DeviceTask::SHADER); main_task.shader_input = d_input.device_pointer; @@ -486,10 +486,18 @@ static void background_cdf(int start, float2 *cond_cdf) { /* Conditional CDFs (rows, U direction). */ + /* NOTE: It is possible to have some NaN pixels on background + * which will ruin CDF causing wrong shading. We replace such + * pixels with black. + */ for(int i = start; i < end; i++) { float sin_theta = sinf(M_PI_F * (i + 0.5f) / res); float3 env_color = (*pixels)[i * res]; float ave_luminance = average(env_color); + /* TODO(sergey): Consider adding average_safe(). 
*/ + if(!isfinite(ave_luminance)) { + ave_luminance = 0.0f; + } cond_cdf[i * cdf_count].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count].y = 0.0f; @@ -497,6 +505,9 @@ static void background_cdf(int start, for(int j = 1; j < res; j++) { env_color = (*pixels)[i * res + j]; ave_luminance = average(env_color); + if(!isfinite(ave_luminance)) { + ave_luminance = 0.0f; + } cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res; diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h index f56530b6490..7e9014eb823 100644 --- a/intern/cycles/render/light.h +++ b/intern/cycles/render/light.h @@ -17,12 +17,12 @@ #ifndef __LIGHT_H__ #define __LIGHT_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index c42b32919d4..a4dc06c4345 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -14,29 +14,29 @@ * limitations under the License. 
*/ -#include "bvh.h" -#include "bvh_build.h" - -#include "camera.h" -#include "curves.h" -#include "device.h" -#include "graph.h" -#include "shader.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "scene.h" - -#include "osl_globals.h" - -#include "subd_split.h" -#include "subd_patch_table.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_progress.h" -#include "util_set.h" +#include "bvh/bvh.h" +#include "bvh/bvh_build.h" + +#include "render/camera.h" +#include "render/curves.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/shader.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/scene.h" + +#include "kernel/osl/osl_globals.h" + +#include "subd/subd_split.h" +#include "subd/subd_patch_table.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_set.h" CCL_NAMESPACE_BEGIN @@ -1873,9 +1873,14 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * dscene->prim_object.reference((uint*)&pack.prim_object[0], pack.prim_object.size()); device->tex_alloc("__prim_object", dscene->prim_object); } + if(pack.prim_time.size()) { + dscene->prim_time.reference((float2*)&pack.prim_time[0], pack.prim_time.size()); + device->tex_alloc("__prim_time", dscene->prim_time); + } dscene->data.bvh.root = pack.root_index; dscene->data.bvh.use_qbvh = scene->params.use_qbvh; + dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0); } void MeshManager::device_update_flags(Device * /*device*/, @@ -2152,6 +2157,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->prim_visibility); device->tex_free(dscene->prim_index); device->tex_free(dscene->prim_object); + device->tex_free(dscene->prim_time); device->tex_free(dscene->tri_shader); device->tex_free(dscene->tri_vnormal); 
device->tex_free(dscene->tri_vindex); @@ -2173,6 +2179,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->prim_visibility.clear(); dscene->prim_index.clear(); dscene->prim_object.clear(); + dscene->prim_time.clear(); dscene->tri_shader.clear(); dscene->tri_vnormal.clear(); dscene->tri_vindex.clear(); diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index 5f33e30eac2..043ce9d0ffc 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -17,17 +17,18 @@ #ifndef __MESH_H__ #define __MESH_H__ -#include "attribute.h" -#include "node.h" -#include "shader.h" - -#include "util_boundbox.h" -#include "util_list.h" -#include "util_map.h" -#include "util_param.h" -#include "util_transform.h" -#include "util_types.h" -#include "util_vector.h" +#include "graph/node.h" + +#include "render/attribute.h" +#include "render/shader.h" + +#include "util/util_boundbox.h" +#include "util/util_list.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_transform.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -48,7 +49,7 @@ struct PackedPatchTable; class Mesh : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Mesh Triangle */ struct Triangle { diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index adc5b820298..cf28bb16bb7 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -14,15 +14,15 @@ * limitations under the License. 
*/ -#include "device.h" +#include "device/device.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "shader.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/shader.h" -#include "util_foreach.h" -#include "util_progress.h" +#include "util/util_foreach.h" +#include "util/util_progress.h" CCL_NAMESPACE_BEGIN @@ -121,9 +121,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("displace_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp index 57c76a9f1c8..585ed77b026 100644 --- a/intern/cycles/render/mesh_subdivision.cpp +++ b/intern/cycles/render/mesh_subdivision.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include "mesh.h" -#include "attribute.h" -#include "camera.h" +#include "render/mesh.h" +#include "render/attribute.h" +#include "render/camera.h" -#include "subd_split.h" -#include "subd_patch.h" -#include "subd_patch_table.h" +#include "subd/subd_split.h" +#include "subd/subd_patch.h" +#include "subd/subd_patch_table.h" -#include "util_foreach.h" -#include "util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_algorithm.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index c02c1adb989..3f56690d0c1 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -14,20 +14,21 @@ * limitations under the License. 
*/ -#include "image.h" -#include "integrator.h" -#include "nodes.h" -#include "scene.h" -#include "svm.h" -#include "svm_color_util.h" -#include "svm_ramp_util.h" -#include "svm_math_util.h" -#include "osl.h" -#include "constant_fold.h" - -#include "util_sky_model.h" -#include "util_foreach.h" -#include "util_transform.h" +#include "render/image.h" +#include "render/integrator.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/svm.h" +#include "kernel/svm/svm_color_util.h" +#include "kernel/svm/svm_ramp_util.h" +#include "kernel/svm/svm_math_util.h" +#include "render/osl.h" +#include "render/constant_fold.h" + +#include "util/util_sky_model.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN @@ -1931,21 +1932,38 @@ GlossyBsdfNode::GlossyBsdfNode() void GlossyBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later one and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp glossy BSDF."; distribution = CLOSURE_BSDF_REFLECTION_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. 
+ */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_REFLECTION_ID) + { + VLOG(1) << "Using GGX glossy with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -1953,7 +1971,8 @@ void GlossyBsdfNode::simplify_settings(Scene *scene) bool GlossyBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_REFLECTION_ID || roughness <= 1e-4f); } void GlossyBsdfNode::compile(SVMCompiler& compiler) @@ -2008,21 +2027,38 @@ GlassBsdfNode::GlassBsdfNode() void GlassBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later one and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp glass BSDF."; distribution = CLOSURE_BSDF_SHARP_GLASS_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. 
+ */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_SHARP_GLASS_ID) + { + VLOG(1) << "Using GGX glass with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -2030,7 +2066,8 @@ void GlassBsdfNode::simplify_settings(Scene *scene) bool GlassBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_SHARP_GLASS_ID || roughness <= 1e-4f); } void GlassBsdfNode::compile(SVMCompiler& compiler) @@ -2085,21 +2122,38 @@ RefractionBsdfNode::RefractionBsdfNode() void RefractionBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later one and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp refraction BSDF."; distribution = CLOSURE_BSDF_REFRACTION_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. 
+ */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_REFRACTION_ID) + { + VLOG(1) << "Using GGX refraction with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -2107,7 +2161,8 @@ void RefractionBsdfNode::simplify_settings(Scene *scene) bool RefractionBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_REFRACTION_ID || roughness <= 1e-4f); } void RefractionBsdfNode::compile(SVMCompiler& compiler) diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index 8d2df673688..d8023747860 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -17,10 +17,10 @@ #ifndef __NODES_H__ #define __NODES_H__ -#include "graph.h" -#include "node.h" +#include "render/graph.h" +#include "graph/node.h" -#include "util_string.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -324,7 +324,7 @@ private: class BsdfNode : public ShaderNode { public: explicit BsdfNode(const NodeType *node_type); - SHADER_NODE_BASE_CLASS(BsdfNode); + SHADER_NODE_BASE_CLASS(BsdfNode) bool has_spatial_varying() { return true; } void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL); @@ -421,7 +421,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness; + float roughness, roughness_orig; ClosureType distribution, distribution_orig; }; @@ -433,7 +433,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness, IOR; + float roughness, roughness_orig, IOR; ClosureType distribution, distribution_orig; }; @@ -445,7 +445,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return 
distribution; } - float roughness, IOR; + float roughness, roughness_orig, IOR; ClosureType distribution, distribution_orig; }; @@ -674,7 +674,7 @@ public: class MixClosureWeightNode : public ShaderNode { public: - SHADER_NODE_CLASS(MixClosureWeightNode); + SHADER_NODE_CLASS(MixClosureWeightNode) float weight; float fac; @@ -920,7 +920,7 @@ public: class CurvesNode : public ShaderNode { public: explicit CurvesNode(const NodeType *node_type); - SHADER_NODE_BASE_CLASS(CurvesNode); + SHADER_NODE_BASE_CLASS(CurvesNode) virtual int get_group() { return NODE_GROUP_LEVEL_3; } diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index 8342f376836..375abfeb27a 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -14,22 +14,22 @@ * limitations under the License. */ -#include "camera.h" -#include "device.h" -#include "light.h" -#include "mesh.h" -#include "curves.h" -#include "object.h" -#include "particles.h" -#include "scene.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_vector.h" - -#include "subd_patch_table.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/curves.h" +#include "render/object.h" +#include "render/particles.h" +#include "render/scene.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_vector.h" + +#include "subd/subd_patch_table.h" CCL_NAMESPACE_BEGIN @@ -49,6 +49,8 @@ NODE_DEFINE(Object) SOCKET_POINT(dupli_generated, "Dupli Generated", make_float3(0.0f, 0.0f, 0.0f)); SOCKET_POINT2(dupli_uv, "Dupli UV", make_float2(0.0f, 0.0f)); + SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", false); + return type; } @@ -597,6 +599,12 @@ void ObjectManager::device_update_flags(Device *device, else { object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME; } + 
if(object->is_shadow_catcher) { + object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER; + } + else { + object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER; + } if(bounds_valid) { foreach(Object *volume_object, volume_objects) { diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 7e306fab2a8..12d7b2c81cf 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -17,14 +17,14 @@ #ifndef __OBJECT_H__ #define __OBJECT_H__ -#include "node.h" -#include "scene.h" +#include "graph/node.h" +#include "render/scene.h" -#include "util_boundbox.h" -#include "util_param.h" -#include "util_transform.h" -#include "util_thread.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_param.h" +#include "util/util_transform.h" +#include "util/util_thread.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -40,7 +40,7 @@ struct Transform; class Object : public Node { public: - NODE_DECLARE; + NODE_DECLARE Mesh *mesh; Transform tfm; @@ -53,6 +53,7 @@ public: bool use_motion; bool hide_on_missing_motion; bool use_holdout; + bool is_shadow_catcher; float3 dupli_generated; float2 dupli_uv; diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 67b68e63cb2..6bff29d1c76 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -14,26 +14,26 @@ * limitations under the License. 
*/ -#include "device.h" +#include "device/device.h" -#include "graph.h" -#include "light.h" -#include "osl.h" -#include "scene.h" -#include "shader.h" -#include "nodes.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/osl.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/nodes.h" #ifdef WITH_OSL -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_progress.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_progress.h" #endif @@ -1096,12 +1096,10 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader) /* finalize */ shader->graph->finalize(scene, false, - true, shader->has_integrator_dependency); if(shader->graph_bump) { shader->graph_bump->finalize(scene, true, - true, shader->has_integrator_dependency, shader->displacement_method == DISPLACE_BOTH); } diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h index b131b672b8c..2be1126fdd3 100644 --- a/intern/cycles/render/osl.h +++ b/intern/cycles/render/osl.h @@ -17,13 +17,13 @@ #ifndef __OSL_H__ #define __OSL_H__ -#include "util_set.h" -#include "util_string.h" -#include "util_thread.h" +#include "util/util_set.h" +#include "util/util_string.h" +#include "util/util_thread.h" -#include "graph.h" -#include "nodes.h" -#include "shader.h" +#include "render/graph.h" +#include "render/nodes.h" +#include "render/shader.h" #ifdef WITH_OSL #include <OSL/oslcomp.h> diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp index 1a35d60fb4b..a51822a08be 100644 --- a/intern/cycles/render/particles.cpp +++ b/intern/cycles/render/particles.cpp @@ -14,15 +14,15 @@ * limitations 
under the License. */ -#include "device.h" -#include "particles.h" -#include "scene.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_vector.h" +#include "device/device.h" +#include "render/particles.h" +#include "render/scene.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h index 2509e27b44b..66d46114b3e 100644 --- a/intern/cycles/render/particles.h +++ b/intern/cycles/render/particles.h @@ -17,8 +17,8 @@ #ifndef __PARTICLES_H__ #define __PARTICLES_H__ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 68124e78cb5..4db20338744 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -16,27 +16,27 @@ #include <stdlib.h> -#include "background.h" -#include "bake.h" -#include "camera.h" -#include "curves.h" -#include "device.h" -#include "film.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "osl.h" -#include "particles.h" -#include "scene.h" -#include "shader.h" -#include "svm.h" -#include "tables.h" - -#include "util_foreach.h" -#include "util_guarded_allocator.h" -#include "util_logging.h" -#include "util_progress.h" +#include "render/background.h" +#include "render/bake.h" +#include "render/camera.h" +#include "render/curves.h" +#include "device/device.h" +#include "render/film.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/osl.h" +#include "render/particles.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/svm.h" +#include 
"render/tables.h" + +#include "util/util_foreach.h" +#include "util/util_guarded_allocator.h" +#include "util/util_logging.h" +#include "util/util_progress.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 8768682043f..2b5267642a2 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -17,18 +17,18 @@ #ifndef __SCENE_H__ #define __SCENE_H__ -#include "image.h" -#include "shader.h" +#include "render/image.h" +#include "render/shader.h" -#include "device_memory.h" +#include "device/device_memory.h" -#include "util_param.h" -#include "util_string.h" -#include "util_system.h" -#include "util_texture.h" -#include "util_thread.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_param.h" +#include "util/util_string.h" +#include "util/util_system.h" +#include "util/util_texture.h" +#include "util/util_thread.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -69,6 +69,7 @@ public: device_vector<uint> prim_visibility; device_vector<uint> prim_index; device_vector<uint> prim_object; + device_vector<float2> prim_time; /* mesh */ device_vector<uint> tri_shader; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 7c01934cfd8..c9b5547b407 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -17,24 +17,24 @@ #include <string.h> #include <limits.h> -#include "buffers.h" -#include "camera.h" -#include "device.h" -#include "graph.h" -#include "integrator.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "session.h" -#include "bake.h" - -#include "util_foreach.h" -#include "util_function.h" -#include "util_logging.h" -#include "util_math.h" -#include "util_opengl.h" -#include "util_task.h" -#include "util_time.h" +#include "render/buffers.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/integrator.h" 
+#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/bake.h" + +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_math.h" +#include "util/util_opengl.h" +#include "util/util_task.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -230,7 +230,9 @@ void Session::run_gpu() while(1) { scoped_timer pause_timer; pause_cond.wait(pause_lock); - progress.add_skip_time(pause_timer, params.background); + if(pause) { + progress.add_skip_time(pause_timer, params.background); + } update_status_time(pause, no_tiles); progress.set_update(); @@ -520,7 +522,9 @@ void Session::run_cpu() while(1) { scoped_timer pause_timer; pause_cond.wait(pause_lock); - progress.add_skip_time(pause_timer, params.background); + if(pause) { + progress.add_skip_time(pause_timer, params.background); + } update_status_time(pause, no_tiles); progress.set_update(); @@ -633,6 +637,9 @@ DeviceRequestedFeatures Session::get_requested_device_features() requested_features.use_patch_evaluation = true; } #endif + if(object->is_shadow_catcher) { + requested_features.use_shadow_tricks = true; + } } BakeManager *bake_manager = scene->bake_manager; @@ -650,6 +657,8 @@ void Session::load_kernels() if(!kernels_loaded) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); + scoped_timer timer; + DeviceRequestedFeatures requested_features = get_requested_device_features(); VLOG(2) << "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { @@ -663,6 +672,9 @@ void Session::load_kernels() return; } + progress.add_skip_time(timer, false); + VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); + kernels_loaded = true; } } @@ -824,7 +836,7 @@ void Session::update_status_time(bool show_pause, bool show_done) int progressive_sample = tile_manager.state.sample; int num_samples = 
tile_manager.get_num_effective_samples(); - int tile = tile_manager.state.num_rendered_tiles; + int tile = progress.get_finished_tiles(); int num_tiles = tile_manager.state.num_tiles; /* update status */ @@ -832,7 +844,7 @@ void Session::update_status_time(bool show_pause, bool show_done) if(!params.progressive) { const bool is_cpu = params.device.type == DEVICE_CPU; - const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles; + const bool is_last_tile = (tile + 1) == num_tiles; substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); @@ -883,6 +895,7 @@ void Session::path_trace() task.need_finish_queue = params.progressive_refine; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; task.requested_tile_size = params.tile_size; + task.passes_size = tile_manager.params.get_passes_size(); device->task_add(task); } diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index c7ff1446171..a7e5f78a64d 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -17,15 +17,15 @@ #ifndef __SESSION_H__ #define __SESSION_H__ -#include "buffers.h" -#include "device.h" -#include "shader.h" -#include "tile.h" - -#include "util_progress.h" -#include "util_stats.h" -#include "util_thread.h" -#include "util_vector.h" +#include "render/buffers.h" +#include "device/device.h" +#include "render/shader.h" +#include "render/tile.h" + +#include "util/util_progress.h" +#include "util/util_stats.h" +#include "util/util_thread.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 335edcbe609..12d3c6cf832 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -14,22 +14,22 @@ * limitations under the License. 
*/ -#include "background.h" -#include "camera.h" -#include "device.h" -#include "graph.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "osl.h" -#include "scene.h" -#include "shader.h" -#include "svm.h" -#include "tables.h" - -#include "util_foreach.h" +#include "render/background.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/osl.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/svm.h" +#include "render/tables.h" + +#include "util/util_foreach.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index 7d896652196..87fef19c592 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -19,20 +19,20 @@ #ifdef WITH_OSL /* So no context pollution happens from indirectly included windows.h */ -# include "util_windows.h" +# include "util/util_windows.h" # include <OSL/oslexec.h> #endif -#include "attribute.h" -#include "kernel_types.h" +#include "render/attribute.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" -#include "util_map.h" -#include "util_param.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_types.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -82,7 +82,7 @@ enum DisplacementMethod { class Shader : public Node { public: - NODE_DECLARE; + NODE_DECLARE int pass_id; diff --git a/intern/cycles/render/sobol.cpp b/intern/cycles/render/sobol.cpp index e3c2e802067..ce93dc8c5d5 100644 --- a/intern/cycles/render/sobol.cpp +++ b/intern/cycles/render/sobol.cpp @@ -46,10 +46,10 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. */ -#include "util_debug.h" -#include "util_types.h" +#include "util/util_debug.h" +#include "util/util_types.h" -#include "sobol.h" +#include "render/sobol.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h index 574f148b9a2..9fbce4e14a5 100644 --- a/intern/cycles/render/sobol.h +++ b/intern/cycles/render/sobol.h @@ -17,7 +17,7 @@ #ifndef __SOBOL_H__ #define __SOBOL_H__ -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index 955b892f4c3..4cb4018e2b4 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -14,20 +14,20 @@ * limitations under the License. */ -#include "device.h" -#include "graph.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "scene.h" -#include "shader.h" -#include "svm.h" - -#include "util_debug.h" -#include "util_logging.h" -#include "util_foreach.h" -#include "util_progress.h" -#include "util_task.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/svm.h" + +#include "util/util_debug.h" +#include "util/util_logging.h" +#include "util/util_foreach.h" +#include "util/util_progress.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -813,7 +813,6 @@ void SVMCompiler::compile(Scene *scene, scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL); shader->graph->finalize(scene, false, - false, shader->has_integrator_dependency); } @@ -821,7 +820,6 @@ void SVMCompiler::compile(Scene *scene, scoped_timer timer((summary != NULL)? 
&summary->time_finalize_bump: NULL); shader->graph_bump->finalize(scene, true, - false, shader->has_integrator_dependency, shader->displacement_method == DISPLACE_BOTH); } diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index a501b6bc8b1..abbd9e50610 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -17,13 +17,13 @@ #ifndef __SVM_H__ #define __SVM_H__ -#include "attribute.h" -#include "graph.h" -#include "shader.h" +#include "render/attribute.h" +#include "render/graph.h" +#include "render/shader.h" -#include "util_set.h" -#include "util_string.h" -#include "util_thread.h" +#include "util/util_set.h" +#include "util/util_string.h" +#include "util/util_thread.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp index dfafd99961b..bf1ef12d602 100644 --- a/intern/cycles/render/tables.cpp +++ b/intern/cycles/render/tables.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "device.h" -#include "scene.h" -#include "tables.h" +#include "device/device.h" +#include "render/scene.h" +#include "render/tables.h" -#include "util_debug.h" -#include "util_logging.h" +#include "util/util_debug.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h index 1bb70b22762..bc261c2a74d 100644 --- a/intern/cycles/render/tables.h +++ b/intern/cycles/render/tables.h @@ -17,7 +17,7 @@ #ifndef __TABLES_H__ #define __TABLES_H__ -#include <util_list.h> +#include "util/util_list.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index a493c3fa1cd..944e746ca2d 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include "tile.h" +#include "render/tile.h" -#include "util_algorithm.h" -#include "util_types.h" +#include "util/util_algorithm.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -131,7 +131,6 @@ void TileManager::reset(BufferParams& params_, int num_samples_) state.buffer = BufferParams(); state.sample = range_start_sample - 1; state.num_tiles = 0; - state.num_rendered_tiles = 0; state.num_samples = 0; state.resolution_divider = get_divider(params.width, params.height, start_resolution); state.tiles.clear(); @@ -343,7 +342,6 @@ bool TileManager::next_tile(Tile& tile, int device) tile = Tile(state.tiles[logical_device].front()); state.tiles[logical_device].pop_front(); - state.num_rendered_tiles++; return true; } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 5d92ebac355..622b89f7670 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -19,8 +19,8 @@ #include <limits.h> -#include "buffers.h" -#include "util_list.h" +#include "render/buffers.h" +#include "util/util_list.h" CCL_NAMESPACE_BEGIN @@ -63,7 +63,6 @@ public: int num_samples; int resolution_divider; int num_tiles; - int num_rendered_tiles; /* Total samples over all pixels: Generally num_samples*num_pixels, * but can be higher due to the initial resolution division for previews. */ diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt index dafb807bdf3..fe0c221ab0d 100644 --- a/intern/cycles/subd/CMakeLists.txt +++ b/intern/cycles/subd/CMakeLists.txt @@ -1,11 +1,6 @@ set(INC - . - ../graph - ../kernel - ../kernel/svm - ../render - ../util + .. ) set(INC_SYS diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp index a1bd349b167..fae815901ee 100644 --- a/intern/cycles/subd/subd_dice.cpp +++ b/intern/cycles/subd/subd_dice.cpp @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include "camera.h" -#include "mesh.h" +#include "render/camera.h" +#include "render/mesh.h" -#include "subd_dice.h" -#include "subd_patch.h" +#include "subd/subd_dice.h" +#include "subd/subd_patch.h" -#include "util_debug.h" +#include "util/util_debug.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h index 33d13a4ab3a..c0e32be18c4 100644 --- a/intern/cycles/subd/subd_dice.h +++ b/intern/cycles/subd/subd_dice.h @@ -22,8 +22,8 @@ * DiagSplit. For more algorithm details, see the DiagSplit paper or the * ARB_tessellation_shader OpenGL extension, Section 2.X.2. */ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp index d3319c5ccf5..fa2fe2bf113 100644 --- a/intern/cycles/subd/subd_patch.cpp +++ b/intern/cycles/subd/subd_patch.cpp @@ -16,12 +16,12 @@ /* Parts adapted from code in the public domain in NVidia Mesh Tools. 
*/ -#include "mesh.h" +#include "render/mesh.h" -#include "subd_patch.h" +#include "subd/subd_patch.h" -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h index 360c1abf27b..1bb81588835 100644 --- a/intern/cycles/subd/subd_patch.h +++ b/intern/cycles/subd/subd_patch.h @@ -17,8 +17,8 @@ #ifndef __SUBD_PATCH_H__ #define __SUBD_PATCH_H__ -#include "util_boundbox.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp index d437b045c07..63bf673a90b 100644 --- a/intern/cycles/subd/subd_patch_table.cpp +++ b/intern/cycles/subd/subd_patch_table.cpp @@ -25,10 +25,10 @@ * */ -#include "subd_patch_table.h" -#include "kernel_types.h" +#include "subd/subd_patch_table.h" +#include "kernel/kernel_types.h" -#include "util_math.h" +#include "util/util_math.h" #ifdef WITH_OPENSUBDIV #include <opensubdiv/far/patchTable.h> diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h index 3166a1691d8..907f2dd6c28 100644 --- a/intern/cycles/subd/subd_patch_table.h +++ b/intern/cycles/subd/subd_patch_table.h @@ -17,8 +17,8 @@ #ifndef __SUBD_PATCH_TABLE_H__ #define __SUBD_PATCH_TABLE_H__ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" #ifdef WITH_OPENSUBDIV #ifdef _MSC_VER diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp index 3c91ad8ab0d..9dbfc1c4e2f 100644 --- a/intern/cycles/subd/subd_split.cpp +++ b/intern/cycles/subd/subd_split.cpp @@ -14,16 +14,16 @@ * limitations under the License. 
*/ -#include "camera.h" -#include "mesh.h" +#include "render/camera.h" +#include "render/mesh.h" -#include "subd_dice.h" -#include "subd_patch.h" -#include "subd_split.h" +#include "subd/subd_dice.h" +#include "subd/subd_patch.h" +#include "subd/subd_split.h" -#include "util_debug.h" -#include "util_math.h" -#include "util_types.h" +#include "util/util_debug.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h index a2f76dd2e03..f869cc6a48e 100644 --- a/intern/cycles/subd/subd_split.h +++ b/intern/cycles/subd/subd_split.h @@ -22,10 +22,10 @@ * evaluation at arbitrary points is required for this to work. See the paper * for more details. */ -#include "subd_dice.h" +#include "subd/subd_dice.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index d8abf671bd6..a015fef8284 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -1,6 +1,6 @@ set(INC - . + .. ../../glew-mx ) @@ -52,6 +52,7 @@ set(SRC_HEADERS util_math.h util_math_cdf.h util_math_fast.h + util_math_intersect.h util_md5.h util_opengl.h util_optimization.h diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp index 15d2eb3271b..cc7252dcc58 100644 --- a/intern/cycles/util/util_aligned_malloc.cpp +++ b/intern/cycles/util/util_aligned_malloc.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "util_aligned_malloc.h" -#include "util_guarded_allocator.h" +#include "util/util_aligned_malloc.h" +#include "util/util_guarded_allocator.h" #include <cassert> diff --git a/intern/cycles/util/util_aligned_malloc.h b/intern/cycles/util/util_aligned_malloc.h index ecc0f28c376..cf1e86ca916 100644 --- a/intern/cycles/util/util_aligned_malloc.h +++ b/intern/cycles/util/util_aligned_malloc.h @@ -17,7 +17,7 @@ #ifndef __UTIL_ALIGNED_MALLOC_H__ #define __UTIL_ALIGNED_MALLOC_H__ -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 433e41fbbb6..6c52117ef9a 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -32,6 +32,13 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) } } +#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) + +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE 0 +#define ccl_barrier(flags) (void)0 + #else /* __KERNEL_GPU__ */ #ifdef __KERNEL_OPENCL__ @@ -39,7 +46,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) /* Float atomics implementation credits: * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */ -ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source, +ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source, const float operand) { union { @@ -56,10 +63,29 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou } while(atomic_cmpxchg((volatile ccl_global unsigned int *)source, prev_value.int_value, new_value.int_value) != prev_value.int_value); + return new_value.float_value; } +#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) +#define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) + +#define CCL_LOCAL_MEM_FENCE 
CLK_LOCAL_MEM_FENCE +#define ccl_barrier(flags) barrier(flags) + #endif /* __KERNEL_OPENCL__ */ +#ifdef __KERNEL_CUDA__ + +#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) + +#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x)) +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE +#define ccl_barrier(flags) __syncthreads() + +#endif /* __KERNEL_CUDA__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index dfe4977aef3..ed94ca20211 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -20,10 +20,10 @@ #include <math.h> #include <float.h> -#include "util_math.h" -#include "util_string.h" -#include "util_transform.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_string.h" +#include "util/util_transform.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index d3598f84b94..4d673dc34d8 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -17,11 +17,11 @@ #ifndef __UTIL_COLOR_H__ #define __UTIL_COLOR_H__ -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" #ifdef __KERNEL_SSE2__ -#include "util_simd.h" +#include "util/util_simd.h" #endif CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 80d177d2cae..9cfa57dd741 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -14,12 +14,12 @@ * limitations under the License. 
*/ -#include "util_debug.h" +#include "util/util_debug.h" #include <stdlib.h> -#include "util_logging.h" -#include "util_string.h" +#include "util/util_logging.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -29,7 +29,8 @@ DebugFlags::CPU::CPU() sse41(true), sse3(true), sse2(true), - qbvh(true) + qbvh(true), + split_kernel(false) { reset(); } @@ -55,10 +56,12 @@ void DebugFlags::CPU::reset() #undef CHECK_CPU_FLAGS qbvh = true; + split_kernel = false; } DebugFlags::CUDA::CUDA() - : adaptive_compile(false) + : adaptive_compile(false), + split_kernel(false) { reset(); } @@ -67,12 +70,15 @@ void DebugFlags::CUDA::reset() { if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; + + split_kernel = false; } DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT), - debug(false) + debug(false), + single_program(false) { reset(); } @@ -112,6 +118,7 @@ void DebugFlags::OpenCL::reset() } /* Initialize other flags from environment variables. 
*/ debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); + single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL); } DebugFlags::DebugFlags() @@ -133,7 +140,9 @@ std::ostream& operator <<(std::ostream &os, << " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n" << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" - << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"; + << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" + << " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n" + << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; @@ -172,9 +181,10 @@ std::ostream& operator <<(std::ostream &os, break; } os << "OpenCL flags:\n" - << " Device type : " << opencl_device_type << "\n" - << " Kernel type : " << opencl_kernel_type << "\n" - << " Debug : " << string_from_bool(debug_flags.opencl.debug) + << " Device type : " << opencl_device_type << "\n" + << " Kernel type : " << opencl_kernel_type << "\n" + << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" + << " Signle program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"; return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 73fd228b5d9..4505d584490 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -20,7 +20,7 @@ #include <cassert> #include <iostream> -#include "util_static_assert.h" +#include "util/util_static_assert.h" CCL_NAMESPACE_BEGIN @@ -46,6 +46,9 @@ public: /* Whether QBVH usage is allowed or not. */ bool qbvh; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ @@ -58,6 +61,9 @@ public: /* Whether adaptive feature based runtime compile is enabled or not. 
* Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of OpenCL feature-set to be used. */ @@ -106,6 +112,9 @@ public: /* Use debug version of the kernel. */ bool debug; + + /* Use single program */ + bool single_program; }; /* Get instance of debug flags registry. */ diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp index 615ac95f324..54fa6a80df5 100644 --- a/intern/cycles/util/util_guarded_allocator.cpp +++ b/intern/cycles/util/util_guarded_allocator.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "util_guarded_allocator.h" -#include "util_stats.h" +#include "util/util_guarded_allocator.h" +#include "util/util_stats.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h index 78453d214be..5f9dcfb2481 100644 --- a/intern/cycles/util/util_guarded_allocator.h +++ b/intern/cycles/util/util_guarded_allocator.h @@ -20,8 +20,8 @@ #include <cstddef> #include <memory> -#include "util_debug.h" -#include "util_types.h" +#include "util/util_debug.h" +#include "util/util_types.h" #ifdef WITH_BLENDER_GUARDEDALLOC # include "../../guardedalloc/MEM_guardedalloc.h" diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index 5db3384cda4..612228dd1c1 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -17,10 +17,11 @@ #ifndef __UTIL_HALF_H__ #define __UTIL_HALF_H__ -#include "util_types.h" +#include "util/util_types.h" +#include "util/util_math.h" #ifdef __KERNEL_SSE2__ -#include "util_simd.h" +#include "util/util_simd.h" #endif CCL_NAMESPACE_BEGIN @@ -110,6 +111,28 @@ ccl_device_inline float4 half4_to_float4(half4 h) return f; } +ccl_device_inline half float_to_half(float f) +{ + const uint u = __float_as_uint(f); + /* Sign bit, shifted to it's position. 
*/ + uint sign_bit = u & 0x80000000; + sign_bit >>= 16; + /* Exponent. */ + uint exponent_bits = u & 0x7f800000; + /* Non-sign bits. */ + uint value_bits = u & 0x7fffffff; + value_bits >>= 13; /* Align mantissa on MSB. */ + value_bits -= 0x1c000; /* Adjust bias. */ + /* Flush-to-zero. */ + value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits; + /* Clamp-to-max. */ + value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits; + /* Denormals-as-zero. */ + value_bits = (exponent_bits == 0 ? 0 : value_bits); + /* Re-insert sign bit and return. */ + return (value_bits | sign_bit); +} + #endif #endif diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h index 98c3a681ff2..a30b7fe288e 100644 --- a/intern/cycles/util/util_hash.h +++ b/intern/cycles/util/util_hash.h @@ -17,7 +17,7 @@ #ifndef __UTIL_HASH_H__ #define __UTIL_HASH_H__ -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h index c8efc551d97..18876841b5b 100644 --- a/intern/cycles/util/util_image.h +++ b/intern/cycles/util/util_image.h @@ -21,7 +21,7 @@ #include <OpenImageIO/imageio.h> -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -42,4 +42,4 @@ CCL_NAMESPACE_END #endif /* __UTIL_IMAGE_H__ */ -#include "util_image_impl.h" +#include "util/util_image_impl.h" diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h index 73ecfda0855..a0f9c66f979 100644 --- a/intern/cycles/util/util_image_impl.h +++ b/intern/cycles/util/util_image_impl.h @@ -17,9 +17,10 @@ #ifndef __UTIL_IMAGE_IMPL_H__ #define __UTIL_IMAGE_IMPL_H__ -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_image.h" +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_half.h" +#include "util/util_image.h" CCL_NAMESPACE_BEGIN @@ -38,6 +39,52 @@ const T *util_image_read(const vector<T>& pixels, return 
&pixels[index]; } +/* Cast input pixel from unknown storage to float. */ +template<typename T> +inline float cast_to_float(T value); + +template<> +inline float cast_to_float(float value) +{ + return value; +} +template<> +inline float cast_to_float(uchar value) +{ + return (float)value / 255.0f; +} +template<> +inline float cast_to_float(half value) +{ + return half_to_float(value); +} + +/* Cast float value to output pixel type. */ +template<typename T> +inline T cast_from_float(float value); + +template<> +inline float cast_from_float(float value) +{ + return value; +} +template<> +inline uchar cast_from_float(float value) +{ + if(value < 0.0f) { + return 0; + } + else if(value > (1.0f - 0.5f / 255.0f)) { + return 255; + } + return (uchar)((255.0f * value) + 0.5f); +} +template<> +inline half cast_from_float(float value) +{ + return float_to_half(value); +} + template<typename T> void util_image_downscale_sample(const vector<T>& pixels, const size_t width, @@ -71,15 +118,22 @@ void util_image_downscale_sample(const vector<T>& pixels, components, nx, ny, nz); for(size_t k = 0; k < components; ++k) { - accum[k] += pixel[k]; + accum[k] += cast_to_float(pixel[k]); } ++count; } } } - const float inv_count = 1.0f / (float)count; - for(size_t k = 0; k < components; ++k) { - result[k] = T(accum[k] * inv_count); + if(count != 0) { + const float inv_count = 1.0f / (float)count; + for(size_t k = 0; k < components; ++k) { + result[k] = cast_from_float<T>(accum[k] * inv_count); + } + } + else { + for(size_t k = 0; k < components; ++k) { + result[k] = T(0.0f); + } } } diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index 03041723e15..a5a3bd34fff 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include <util_logging.h> +#include "util/util_logging.h" -#include "util_math.h" +#include "util/util_math.h" #include <stdio.h> #ifdef _MSC_VER @@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity) } std::ostream& operator <<(std::ostream &os, + const int2 &value) +{ + os << "(" << value.x + << ", " << value.y + << ")"; + return os; +} + +std::ostream& operator <<(std::ostream &os, const float3 &value) { os << "(" << value.x diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index 2aa9c25b1a0..ecf9c9cfee0 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -45,6 +45,7 @@ public: #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) +struct int2; struct float3; void util_logging_init(const char *argv0); @@ -52,6 +53,8 @@ void util_logging_start(void); void util_logging_verbosity_set(int verbosity); std::ostream& operator <<(std::ostream &os, + const int2 &value); +std::ostream& operator <<(std::ostream &os, const float3 &value); CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 2b81c8c498a..e0305b978b9 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -35,7 +35,7 @@ #endif -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -43,41 +43,41 @@ CCL_NAMESPACE_BEGIN /* Division */ #ifndef M_PI_F -#define M_PI_F ((float)3.14159265358979323846264338327950288) /* pi */ +#define M_PI_F (3.1415926535897932f) /* pi */ #endif #ifndef M_PI_2_F -#define M_PI_2_F ((float)1.57079632679489661923132169163975144) /* pi/2 */ +#define M_PI_2_F (1.5707963267948966f) /* pi/2 */ #endif #ifndef M_PI_4_F -#define M_PI_4_F ((float)0.785398163397448309615660845819875721) /* pi/4 */ +#define M_PI_4_F (0.7853981633974830f) /* pi/4 */ #endif #ifndef M_1_PI_F -#define M_1_PI_F ((float)0.318309886183790671537767526745028724) /* 1/pi */ +#define M_1_PI_F (0.3183098861837067f) /* 1/pi */ #endif 
#ifndef M_2_PI_F -#define M_2_PI_F ((float)0.636619772367581343075535053490057448) /* 2/pi */ +#define M_2_PI_F (0.6366197723675813f) /* 2/pi */ #endif /* Multiplication */ #ifndef M_2PI_F -#define M_2PI_F ((float)6.283185307179586476925286766559005768) /* 2*pi */ +#define M_2PI_F (6.2831853071795864f) /* 2*pi */ #endif #ifndef M_4PI_F -#define M_4PI_F ((float)12.56637061435917295385057353311801153) /* 4*pi */ +#define M_4PI_F (12.566370614359172f) /* 4*pi */ #endif /* Float sqrt variations */ #ifndef M_SQRT2_F -#define M_SQRT2_F ((float)1.41421356237309504880) /* sqrt(2) */ +#define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ #endif #ifndef M_LN2_F -#define M_LN2_F ((float)0.6931471805599453) /* ln(2) */ +#define M_LN2_F (0.6931471805599453f) /* ln(2) */ #endif #ifndef M_LN10_F -#define M_LN10_F ((float)2.3025850929940457) /* ln(10) */ +#define M_LN10_F (2.3025850929940457f) /* ln(10) */ #endif /* Scalar */ @@ -774,6 +774,7 @@ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __force return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))); } +#if defined(__KERNEL_SSE3__) template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) { return _mm_moveldup_ps(b); @@ -783,6 +784,7 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) { return _mm_movehdup_ps(b); } +#endif template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) { @@ -1241,19 +1243,6 @@ ccl_device_inline float __uint_as_float(uint i) return u.f; } -/* Versions of functions which are safe for fast math. 
*/ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)); -} /* Interpolation */ @@ -1271,6 +1260,20 @@ ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const #endif +/* Versions of functions which are safe for fast math. */ +ccl_device_inline bool isnan_safe(float f) +{ + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; +} + +ccl_device_inline bool isfinite_safe(float f) +{ + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); +} + /* Orthonormal vectors */ ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b) @@ -1329,7 +1332,7 @@ ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b) y = (b.y != 0.0f)? a.y/b.y: 0.0f; z = (b.z != 0.0f)? 
a.z/b.z: 0.0f; - /* try to get grey even if b is zero */ + /* try to get gray even if b is zero */ if(b.x == 0.0f) { if(b.y == 0.0f) { x = z; @@ -1451,181 +1454,9 @@ ccl_device_inline float beta(float x, float y) #endif } -/* Ray Intersection */ - -ccl_device bool ray_sphere_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 sphere_P, float sphere_radius, - float3 *isect_P, float *isect_t) -{ - float3 d = sphere_P - ray_P; - float radiussq = sphere_radius*sphere_radius; - float tsq = dot(d, d); - - if(tsq > radiussq) { /* ray origin outside sphere */ - float tp = dot(d, ray_D); - - if(tp < 0.0f) /* dir points away from sphere */ - return false; - - float dsq = tsq - tp*tp; /* pythagoras */ - - if(dsq > radiussq) /* closest point on ray outside sphere */ - return false; - - float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ - - if(t < ray_t) { - *isect_t = t; - *isect_P = ray_P + ray_D*t; - return true; - } - } - - return false; -} - -ccl_device bool ray_aligned_disk_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 disk_P, float disk_radius, - float3 *isect_P, float *isect_t) +ccl_device_inline float xor_signmask(float x, int y) { - /* aligned disk normal */ - float disk_t; - float3 disk_N = normalize_len(ray_P - disk_P, &disk_t); - float div = dot(ray_D, disk_N); - - if(UNLIKELY(div == 0.0f)) - return false; - - /* compute t to intersection point */ - float t = -disk_t/div; - if(t < 0.0f || t > ray_t) - return false; - - /* test if within radius */ - float3 P = ray_P + ray_D*t; - if(len_squared(P - disk_P) > disk_radius*disk_radius) - return false; - - *isect_P = P; - *isect_t = t; - - return true; -} - -ccl_device bool ray_triangle_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float3 *isect_P, float *isect_t) -{ - /* Calculate intersection */ - float3 e1 = v1 - v0; - float3 e2 = v2 - v0; - float3 s1 = cross(ray_D, e2); - - const float divisor = dot(s1, e1); - if(UNLIKELY(divisor == 0.0f)) - 
return false; - - const float invdivisor = 1.0f/divisor; - - /* compute first barycentric coordinate */ - const float3 d = ray_P - v0; - const float u = dot(d, s1)*invdivisor; - if(u < 0.0f) - return false; - - /* Compute second barycentric coordinate */ - const float3 s2 = cross(d, e1); - const float v = dot(ray_D, s2)*invdivisor; - if(v < 0.0f) - return false; - - const float b0 = 1.0f - u - v; - if(b0 < 0.0f) - return false; - - /* compute t to intersection point */ - const float t = dot(e2, s2)*invdivisor; - if(t < 0.0f || t > ray_t) - return false; - - *isect_t = t; - *isect_P = ray_P + ray_D*t; - - return true; -} - -ccl_device_inline bool ray_triangle_intersect_uv( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float *isect_u, float *isect_v, float *isect_t) -{ - /* Calculate intersection */ - float3 e1 = v1 - v0; - float3 e2 = v2 - v0; - float3 s1 = cross(ray_D, e2); - - const float divisor = dot(s1, e1); - if(UNLIKELY(divisor == 0.0f)) - return false; - - const float invdivisor = 1.0f/divisor; - - /* compute first barycentric coordinate */ - const float3 d = ray_P - v0; - const float u = dot(d, s1)*invdivisor; - if(u < 0.0f) - return false; - - /* Compute second barycentric coordinate */ - const float3 s2 = cross(d, e1); - const float v = dot(ray_D, s2)*invdivisor; - if(v < 0.0f) - return false; - - const float b0 = 1.0f - u - v; - if(b0 < 0.0f) - return false; - - /* compute t to intersection point */ - const float t = dot(e2, s2)*invdivisor; - if(t < 0.0f || t > ray_t) - return false; - - *isect_u = u; - *isect_v = v; - *isect_t = t; - - return true; -} - -ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt, - float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n, - float3 *isect_P, float *isect_t, float *isect_u, float *isect_v) -{ - float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); - if(t < ray_mint || t > ray_maxt) - return false; - - float3 hit = 
ray_P + t*ray_D; - float3 inplane = hit - quad_P; - - float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f; - if(u < 0.0f || u > 1.0f) - return false; - - float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f; - if(v < 0.0f || v > 1.0f) - return false; - - if(isect_P) *isect_P = hit; - if(isect_t) *isect_t = t; - if(isect_u) *isect_u = u; - if(isect_v) *isect_v = v; - - return true; + return __int_as_float(__float_as_int(x) ^ y); } /* projections */ @@ -1690,4 +1521,3 @@ ccl_device_inline int util_max_axis(float3 vec) CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ - diff --git a/intern/cycles/util/util_math_cdf.cpp b/intern/cycles/util/util_math_cdf.cpp index ec78ca15d88..c14d4793ea1 100644 --- a/intern/cycles/util/util_math_cdf.cpp +++ b/intern/cycles/util/util_math_cdf.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include "util_math_cdf.h" +#include "util/util_math_cdf.h" -#include "util_algorithm.h" -#include "util_math.h" +#include "util/util_algorithm.h" +#include "util/util_math.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h index 47dfb68ba44..79643fe26e3 100644 --- a/intern/cycles/util/util_math_cdf.h +++ b/intern/cycles/util/util_math_cdf.h @@ -17,9 +17,9 @@ #ifndef __UTIL_MATH_CDF_H__ #define __UTIL_MATH_CDF_H__ -#include "util_algorithm.h" -#include "util_math.h" -#include "util_vector.h" +#include "util/util_algorithm.h" +#include "util/util_math.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h new file mode 100644 index 00000000000..2b65a0dfa48 --- /dev/null +++ b/intern/cycles/util/util_math_intersect.h @@ -0,0 +1,221 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INTERSECT_H__ +#define __UTIL_MATH_INTERSECT_H__ + +CCL_NAMESPACE_BEGIN + +/* Ray Intersection */ + +ccl_device bool ray_sphere_intersect( + float3 ray_P, float3 ray_D, float ray_t, + float3 sphere_P, float sphere_radius, + float3 *isect_P, float *isect_t) +{ + const float3 d = sphere_P - ray_P; + const float radiussq = sphere_radius*sphere_radius; + const float tsq = dot(d, d); + + if(tsq > radiussq) { + /* Ray origin outside sphere. */ + const float tp = dot(d, ray_D); + if(tp < 0.0f) { + /* Ray points away from sphere. */ + return false; + } + const float dsq = tsq - tp*tp; /* pythagoras */ + if(dsq > radiussq) { + /* Closest point on ray outside sphere. */ + return false; + } + const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ + if(t < ray_t) { + *isect_t = t; + *isect_P = ray_P + ray_D*t; + return true; + } + } + return false; +} + +ccl_device bool ray_aligned_disk_intersect( + float3 ray_P, float3 ray_D, float ray_t, + float3 disk_P, float disk_radius, + float3 *isect_P, float *isect_t) +{ + /* Aligned disk normal. */ + float disk_t; + const float3 disk_N = normalize_len(ray_P - disk_P, &disk_t); + const float div = dot(ray_D, disk_N); + if(UNLIKELY(div == 0.0f)) { + return false; + } + /* Compute t to intersection point. */ + const float t = -disk_t/div; + if(t < 0.0f || t > ray_t) { + return false; + } + /* Test if within radius. 
*/ + float3 P = ray_P + ray_D*t; + if(len_squared(P - disk_P) > disk_radius*disk_radius) { + return false; + } + *isect_P = P; + *isect_t = t; + return true; +} + +#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 +ccl_device_inline +#else +ccl_device_forceinline +#endif +bool ray_triangle_intersect( + float3 ray_P, float3 ray_dir, float ray_t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts, +#else + const float3 tri_a, const float3 tri_b, const float3 tri_c, +#endif + float *isect_u, float *isect_v, float *isect_t) +{ +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + typedef ssef float3; + const float3 tri_a(ssef_verts[0]); + const float3 tri_b(ssef_verts[1]); + const float3 tri_c(ssef_verts[2]); + const float3 P(ray_P); + const float3 dir(ray_dir); +#else +# define dot3(a, b) dot(a, b) + const float3 P = ray_P; + const float3 dir = ray_dir; +#endif + + /* Calculate vertices relative to ray origin. */ + const float3 v0 = tri_c - P; + const float3 v1 = tri_a - P; + const float3 v2 = tri_b - P; + + /* Calculate triangle edges. */ + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + const float3 e2 = v1 - v2; + + /* Perform edge tests. 
*/ +#ifdef __KERNEL_SSE2__ + const float3 crossU = cross(v2 + v0, e0); + const float3 crossV = cross(v0 + v1, e1); + const float3 crossW = cross(v1 + v2, e2); +# ifndef __KERNEL_SSE__ + const ssef crossX(crossU.x, crossV.x, crossW.x, crossW.x); + const ssef crossY(crossU.y, crossV.y, crossW.y, crossW.y); + const ssef crossZ(crossU.z, crossV.z, crossW.z, crossW.z); +# else + ssef crossX(crossU); + ssef crossY(crossV); + ssef crossZ(crossW); + ssef zero = _mm_setzero_ps(); + _MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero); +# endif + const ssef dirX(ray_dir.x); + const ssef dirY(ray_dir.y); + const ssef dirZ(ray_dir.z); + /*const*/ ssef UVWW = crossX*dirX + crossY*dirY + crossZ*dirZ; + const float minUVW = reduce_min(UVWW); + const float maxUVW = reduce_max(UVWW); +#else /* __KERNEL_SSE2__ */ + const float U = dot(cross(v2 + v0, e0), ray_dir); + const float V = dot(cross(v0 + v1, e1), ray_dir); + const float W = dot(cross(v1 + v2, e2), ray_dir); + const float minUVW = min(U, min(V, W)); + const float maxUVW = max(U, max(V, W)); +#endif /* __KERNEL_SSE2__ */ + + if(minUVW < 0.0f && maxUVW > 0.0f) { + return false; + } + + /* Calculate geometry normal and denominator. */ + const float3 Ng1 = cross(e1, e0); + //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); + const float3 Ng = Ng1 + Ng1; + const float den = dot3(Ng, dir); + /* Avoid division by 0. */ + if(UNLIKELY(den == 0.0f)) { + return false; + } + + /* Perform depth test. 
*/ + const float T = dot3(v0, Ng); + const int sign_den = (__float_as_int(den) & 0x80000000); + const float sign_T = xor_signmask(T, sign_den); + if((sign_T < 0.0f) || + (sign_T > ray_t * xor_signmask(den, sign_den))) + { + return false; + } + + const float inv_den = 1.0f / den; +#ifdef __KERNEL_SSE2__ + UVWW *= inv_den; + _mm_store_ss(isect_u, UVWW); + _mm_store_ss(isect_v, shuffle<1,1,3,3>(UVWW)); +#else + *isect_u = U * inv_den; + *isect_v = V * inv_den; +#endif + *isect_t = T * inv_den; + return true; + +#undef dot3 +} + +ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, + float ray_mint, float ray_maxt, + float3 quad_P, + float3 quad_u, float3 quad_v, float3 quad_n, + float3 *isect_P, float *isect_t, + float *isect_u, float *isect_v) +{ + /* Perform intersection test. */ + float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); + if(t < ray_mint || t > ray_maxt) { + return false; + } + const float3 hit = ray_P + t*ray_D; + const float3 inplane = hit - quad_P; + const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f; + if(u < 0.0f || u > 1.0f) { + return false; + } + const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f; + if(v < 0.0f || v > 1.0f) { + return false; + } + /* Store the result. */ + /* TODO(sergey): Check whether we can avoid some checks here. 
*/ + if(isect_P != NULL) *isect_P = hit; + if(isect_t != NULL) *isect_t = t; + if(isect_u != NULL) *isect_u = u; + if(isect_v != NULL) *isect_v = v; + return true; +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INTERSECT_H__ */ diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h index d0af9fdb004..e4cd66c85b0 100644 --- a/intern/cycles/util/util_md5.h +++ b/intern/cycles/util/util_md5.h @@ -30,8 +30,8 @@ #ifndef __UTIL_MD5_H__ #define __UTIL_MD5_H__ -#include "util_string.h" -#include "util_types.h" +#include "util/util_string.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index adc141a7b28..6f70a474fe7 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -111,7 +111,7 @@ /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. * Since we can't avoid including <windows.h>, better only include that */ -#include "util_windows.h" +#include "util/util_windows.h" #endif diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 5df262fcbbb..cd3067f7650 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include "util_debug.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_string.h" +#include "util/util_debug.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_string.h" #include <OpenImageIO/filesystem.h> #include <OpenImageIO/strutil.h> @@ -45,7 +45,7 @@ OIIO_NAMESPACE_USING # include <shlwapi.h> #endif -#include "util_windows.h" +#include "util/util_windows.h" CCL_NAMESPACE_BEGIN @@ -320,17 +320,18 @@ static char *path_specials(const string& sub) { static bool env_init = false; static char *env_shader_path; - static char *env_kernel_path; + static char *env_source_path; if(!env_init) { env_shader_path = getenv("CYCLES_SHADER_PATH"); - env_kernel_path = getenv("CYCLES_KERNEL_PATH"); + /* NOTE: It is KERNEL in env variable for compatibility reasons. */ + env_source_path = getenv("CYCLES_KERNEL_PATH"); env_init = true; } if(env_shader_path != NULL && sub == "shader") { return env_shader_path; } - else if(env_shader_path != NULL && sub == "kernel") { - return env_kernel_path; + else if(env_shader_path != NULL && sub == "source") { + return env_source_path; } return NULL; } @@ -814,7 +815,7 @@ string path_source_replace_includes(const string& source, /* Use line directives for better error messages. 
*/ line = line_directive(filepath, 1) + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(path_join(path, source_filename), i); + + line_directive(path_join(path, source_filename), i + 1); } } } diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h index 70dbb5ae403..0e5e2d2c837 100644 --- a/intern/cycles/util/util_path.h +++ b/intern/cycles/util/util_path.h @@ -24,10 +24,10 @@ #include <stdio.h> -#include "util_set.h" -#include "util_string.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_set.h" +#include "util/util_string.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 14215056840..39c1eed04e7 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -23,10 +23,10 @@ * update notifications from a job running in another thread. All methods * except for the constructor/destructor are thread safe. 
*/ -#include "util_function.h" -#include "util_string.h" -#include "util_time.h" -#include "util_thread.h" +#include "util/util_function.h" +#include "util/util_string.h" +#include "util/util_time.h" +#include "util/util_thread.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp index de2df612578..f90439c188b 100644 --- a/intern/cycles/util/util_simd.cpp +++ b/intern/cycles/util/util_simd.cpp @@ -19,7 +19,7 @@ (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__)) #define __KERNEL_SSE2__ -#include "util_simd.h" +#include "util/util_simd.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 756bd15ed25..557809a5719 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -20,8 +20,8 @@ #include <limits> -#include "util_debug.h" -#include "util_types.h" +#include "util/util_debug.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -451,11 +451,11 @@ ccl_device_inline int bitscan(int value) CCL_NAMESPACE_END -#include "util_math.h" -#include "util_sseb.h" -#include "util_ssei.h" -#include "util_ssef.h" -#include "util_avxf.h" +#include "util/util_math.h" +#include "util/util_sseb.h" +#include "util/util_ssei.h" +#include "util/util_ssef.h" +#include "util/util_avxf.h" #endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp index 5730986cc4f..6dda8469907 100644 --- a/intern/cycles/util/util_sky_model.cpp +++ b/intern/cycles/util/util_sky_model.cpp @@ -97,8 +97,8 @@ All instructions on how to use this code are in the accompanying header file. 
*/ -#include "util_sky_model.h" -#include "util_sky_model_data.h" +#include "util/util_sky_model.h" +#include "util/util_sky_model_data.h" #include <assert.h> #include <stdio.h> diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index 2f5295b5463..cf99a08efae 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -514,12 +514,12 @@ ccl_device_inline float len3(const ssef& a) /* faster version for SSSE3 */ typedef ssei shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } @@ -534,12 +534,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s /* somewhat slower version for SSE2 */ typedef int shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return 0; } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return 1; } diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h index d7aab5b250c..2f1799a739e 100644 --- a/intern/cycles/util/util_stack_allocator.h +++ b/intern/cycles/util/util_stack_allocator.h @@ -20,8 +20,8 @@ #include <cstddef> #include <memory> -#include "util_debug.h" -#include "util_types.h" +#include "util/util_debug.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index 033d85e8ec6..e90049254de 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ 
-43,7 +43,9 @@ template <> class StaticAssertFailure<true> {}; # endif /* __COUNTER__ */ # endif /* C++11 or MSVC2015 */ #else /* __KERNEL_GPU__ */ -# define static_assert(statement, message) +# ifndef static_assert +# define static_assert(statement, message) +# endif #endif /* __KERNEL_GPU__ */ /* TODO(sergey): For until C++11 is a bare minimum for us, diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index c21a8488c81..baba549753d 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -17,7 +17,7 @@ #ifndef __UTIL_STATS_H__ #define __UTIL_STATS_H__ -#include "util_atomic.h" +#include "util/util_atomic.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index 5594aa8edb6..a1008d510d1 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -17,9 +17,9 @@ #include <stdarg.h> #include <stdio.h> -#include "util_foreach.h" -#include "util_string.h" -#include "util_windows.h" +#include "util/util_foreach.h" +#include "util/util_string.h" +#include "util/util_windows.h" #ifdef _WIN32 # ifndef vsnprintf diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h index 7aeed96f00b..e2c105db9c1 100644 --- a/intern/cycles/util/util_string.h +++ b/intern/cycles/util/util_string.h @@ -21,7 +21,7 @@ #include <string> #include <sstream> -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 87d885c44cf..a942d738b8a 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -14,12 +14,12 @@ * limitations under the License. 
*/ -#include "util_system.h" +#include "util/util_system.h" -#include "util_debug.h" -#include "util_logging.h" -#include "util_types.h" -#include "util_string.h" +#include "util/util_debug.h" +#include "util/util_logging.h" +#include "util/util_types.h" +#include "util/util_string.h" #ifdef _WIN32 # if(!defined(FREE_WINDOWS)) diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index ff61b260bed..db7a45b2d59 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -17,7 +17,7 @@ #ifndef __UTIL_SYSTEM_H__ #define __UTIL_SYSTEM_H__ -#include "util_string.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index 0d1fed3ebbf..fb0c34e1dc4 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "util_debug.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_system.h" -#include "util_task.h" -#include "util_time.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_time.h" //#define THREADING_DEBUG_ENABLED diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h index 0b82f14f66f..3ebfb007e40 100644 --- a/intern/cycles/util/util_task.h +++ b/intern/cycles/util/util_task.h @@ -17,10 +17,10 @@ #ifndef __UTIL_TASK_H__ #define __UTIL_TASK_H__ -#include "util_list.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp index 3db8b4bd197..3dcb09804b0 100644 --- a/intern/cycles/util/util_thread.cpp +++ 
b/intern/cycles/util/util_thread.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include "util_thread.h" +#include "util/util_thread.h" -#include "util_system.h" -#include "util_windows.h" +#include "util/util_system.h" +#include "util/util_windows.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h index 427c633d2ce..1b4e87ebf03 100644 --- a/intern/cycles/util/util_thread.h +++ b/intern/cycles/util/util_thread.h @@ -32,7 +32,7 @@ # include <libkern/OSAtomic.h> #endif -#include "util_function.h" +#include "util/util_function.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp index 59c963cfafb..7c39aa294bf 100644 --- a/intern/cycles/util/util_time.cpp +++ b/intern/cycles/util/util_time.cpp @@ -16,8 +16,8 @@ #include <stdlib.h> -#include "util_time.h" -#include "util_windows.h" +#include "util/util_time.h" +#include "util/util_windows.h" #ifdef _WIN32 diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp index 2f10540c94e..b8f182ae962 100644 --- a/intern/cycles/util/util_transform.cpp +++ b/intern/cycles/util/util_transform.cpp @@ -46,10 +46,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "util_transform.h" +#include "util/util_transform.h" -#include "util_boundbox.h" -#include "util_math.h" +#include "util/util_boundbox.h" +#include "util/util_math.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index a0695f20488..aef168ca64d 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -21,8 +21,8 @@ #include <string.h> #endif -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a000fae4bd6..bf4a134b998 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -37,6 +37,9 @@ #define ccl_device_noinline static #define ccl_global #define ccl_constant +#define ccl_local +#define ccl_local_param +#define ccl_private #define ccl_restrict __restrict #define __KERNEL_WITH_SSE_ALIGN__ @@ -82,7 +85,7 @@ /* SIMD Types */ -#include "util_optimization.h" +#include "util/util_optimization.h" #endif @@ -103,10 +106,16 @@ typedef unsigned int uint; #endif -#ifndef __KERNEL_GPU__ - /* Fixed Bits Types */ +#ifdef __KERNEL_OPENCL__ + +typedef ulong uint64_t; + +#endif + +#ifndef __KERNEL_GPU__ + #ifdef _WIN32 typedef signed char int8_t; @@ -171,7 +180,7 @@ struct ccl_try_align(16) int3 { }; __forceinline int3() {} - __forceinline int3(const __m128i a) : m128(a) {} + __forceinline int3(const __m128i& a) : m128(a) {} __forceinline operator const __m128i&(void) const { return m128; } __forceinline operator __m128i&(void) { return m128; } @@ -193,7 +202,7 @@ struct ccl_try_align(16) int4 { }; __forceinline int4() {} - __forceinline int4(const __m128i a) : m128(a) {} + __forceinline int4(const __m128i& a) : m128(a) {} __forceinline operator const __m128i&(void) const { return m128; } __forceinline operator __m128i&(void) { return m128; } @@ -265,7 +274,7 @@ struct 
ccl_try_align(16) float4 { }; __forceinline float4() {} - __forceinline float4(const __m128 a) : m128(a) {} + __forceinline float4(const __m128& a) : m128(a) {} __forceinline operator const __m128&(void) const { return m128; } __forceinline operator __m128&(void) { return m128; } @@ -397,11 +406,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w) return a; } -ccl_device_inline int align_up(int offset, int alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - ccl_device_inline int3 make_int3(int i) { #ifdef __KERNEL_SSE__ @@ -476,6 +480,21 @@ ccl_device_inline int4 make_int4(const float3& f) #endif +ccl_device_inline size_t align_up(size_t offset, size_t alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + +ccl_device_inline size_t round_up(size_t x, size_t multiple) +{ + return ((x + multiple - 1) / multiple) * multiple; +} + +ccl_device_inline size_t round_down(size_t x, size_t multiple) +{ + return (x / multiple) * multiple; +} + /* Interpolation types for textures * cuda also use texture space to store other objects */ enum InterpolationType { diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h index 546b17570bb..4add91a3368 100644 --- a/intern/cycles/util/util_vector.h +++ b/intern/cycles/util/util_vector.h @@ -23,9 +23,9 @@ #include <cstring> #include <vector> -#include "util_aligned_malloc.h" -#include "util_guarded_allocator.h" -#include "util_types.h" +#include "util/util_aligned_malloc.h" +#include "util/util_guarded_allocator.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp index 9796a5f896d..10d86167921 100644 --- a/intern/cycles/util/util_view.cpp +++ b/intern/cycles/util/util_view.cpp @@ -17,11 +17,11 @@ #include <stdio.h> #include <stdlib.h> -#include "util_opengl.h" -#include "util_string.h" -#include "util_time.h" -#include "util_version.h" -#include "util_view.h" +#include 
"util/util_opengl.h" +#include "util/util_string.h" +#include "util/util_time.h" +#include "util/util_version.h" +#include "util/util_view.h" #ifdef __APPLE__ #include <GLUT/glut.h> diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp index 4de8483564b..073db2a27db 100644 --- a/intern/cycles/util/util_windows.cpp +++ b/intern/cycles/util/util_windows.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "util_windows.h" +#include "util/util_windows.h" #ifdef _WIN32 |